1305 lines
42 KiB
HolyC
1305 lines
42 KiB
HolyC
#define GROWABLE_STRING_INCREMENT_SIZE 16
|
|
|
|
#define HTML_STATE_INVALID 0
|
|
#define HTML_STATE_DATA 1
|
|
#define HTML_STATE_RCDATA 2
|
|
#define HTML_STATE_RAWTEXT 3
|
|
#define HTML_STATE_SCRIPT_DATA 4
|
|
#define HTML_STATE_PLAINTEXT 5
|
|
#define HTML_STATE_TAG_OPEN 6
|
|
#define HTML_STATE_END_TAG_OPEN 7
|
|
#define HTML_STATE_TAG_NAME 8
|
|
#define HTML_STATE_RCDATA_LESS_THAN_SIGN 9
|
|
#define HTML_STATE_RCDATA_END_TAG_OPEN 10
|
|
#define HTML_STATE_RCDATA_END_TAG_NAME 11
|
|
#define HTML_STATE_RAWTEXT_LESS_THAN_SIGN 12
|
|
#define HTML_STATE_RAWTEXT_END_TAG_OPEN 13
|
|
#define HTML_STATE_RAWTEXT_END_TAG_NAME 14
|
|
#define HTML_STATE_SCRIPT_DATA_LESS_THAN_SIGN 15
|
|
#define HTML_STATE_SCRIPT_DATA_END_TAG_OPEN 16
|
|
#define HTML_STATE_SCRIPT_DATA_END_TAG_NAME 17
|
|
#define HTML_STATE_SCRIPT_DATA_ESCAPE_START 18
|
|
#define HTML_STATE_SCRIPT_DATA_ESCAPE_START_DASH 19
|
|
#define HTML_STATE_SCRIPT_DATA_ESCAPED 20
|
|
#define HTML_STATE_SCRIPT_DATA_ESCAPED_DASH 21
|
|
#define HTML_STATE_SCRIPT_DATA_ESCAPED_DASH_DASH 22
|
|
#define HTML_STATE_SCRIPT_DATA_ESCAPED_LESS_THAN 23
|
|
#define HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_OPEN 24
|
|
#define HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME 25
|
|
#define HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPE_START 26
|
|
#define HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED 27
|
|
#define HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_DASH 28
|
|
#define HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH 29
|
|
#define HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN 30
|
|
#define HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPE_END 31
|
|
#define HTML_STATE_BEFORE_ATTRIBUTE_NAME 32
|
|
#define HTML_STATE_ATTRIBUTE_NAME 33
|
|
#define HTML_STATE_AFTER_ATTRIBUTE_NAME 34
|
|
#define HTML_STATE_BEFORE_ATTRIBUTE_VALUE 35
|
|
#define HTML_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED 36
|
|
#define HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED 37
|
|
#define HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED 38
|
|
#define HTML_STATE_AFTER_ATTRIBUTE_VALUE_QUOTED 39
|
|
#define HTML_STATE_SELF_CLOSING_START_TAG 40
|
|
#define HTML_STATE_BOGUS_COMMENT 41
|
|
#define HTML_STATE_MARKUP_DECLARATION_OPEN 42
|
|
#define HTML_STATE_COMMENT_START 43
|
|
#define HTML_STATE_COMMENT_START_DASH 44
|
|
#define HTML_STATE_COMMENT 45
|
|
#define HTML_STATE_COMMENT_LESS_THAN_SIGN 46
|
|
#define HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG 47
|
|
#define HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG_DASH 48
|
|
#define HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH 49
|
|
#define HTML_STATE_COMMENT_END_DASH 50
|
|
#define HTML_STATE_COMMENT_END 51
|
|
#define HTML_STATE_COMMENT_END_BANG 52
|
|
#define HTML_STATE_DOCTYPE 53
|
|
#define HTML_STATE_BEFORE_DOCTYPE_NAME 54
|
|
#define HTML_STATE_DOCTYPE_NAME 55
|
|
#define HTML_STATE_AFTER_DOCTYPE_NAME 56
|
|
#define HTML_STATE_AFTER_DOCTYPE_PUBLIC_KEYWORD 57
|
|
#define HTML_STATE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER 58
|
|
#define HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED 59
|
|
#define HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED 60
|
|
#define HTML_STATE_AFTER_DOCTYPE_PUBLIC_IDENTIFIER 61
|
|
#define HTML_STATE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS 62
|
|
#define HTML_STATE_AFTER_DOCTYPE_SYSTEM_KEYWORD 63
|
|
#define HTML_STATE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER 64
|
|
#define HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED 65
|
|
#define HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED 66
|
|
#define HTML_STATE_AFTER_DOCTYPE_SYSTEM_IDENTIFIER 67
|
|
#define HTML_STATE_BOGUS_DOCTYPE 68
|
|
#define HTML_STATE_CDATA_SECTION 69
|
|
#define HTML_STATE_CDATA_SECTION_BRACKET 70
|
|
#define HTML_STATE_CDATA_SECTION_END 71
|
|
#define HTML_STATE_CHARACTER_REFERENCE 72
|
|
#define HTML_STATE_NAMED_CHARACTER_REFERENCE 73
|
|
#define HTML_STATE_AMBIGUOUS_AMPERSAND 74
|
|
#define HTML_STATE_NUMERIC_CHARACTER_REFERENCE 75
|
|
#define HTML_STATE_HEXADECIMAL_CHARACTER_REFERENCE_START 76
|
|
#define HTML_STATE_DECIMAL_CHARACTER_REFERENCE_START 77
|
|
#define HTML_STATE_HEXADECIMAL_CHARACTER_REFERENCE 78
|
|
#define HTML_STATE_DECIMAL_CHARACTER_REFERENCE 79
|
|
#define HTML_STATE_NUMERIC_CHARACTER_REFERENCE_END 80
|
|
|
|
class @html_dom_node : JsonElement
|
|
{
|
|
@html_dom_node* parentNode;
|
|
U8 tagName[32];
|
|
JsonObject* attributes;
|
|
JsonArray* children;
|
|
|
|
U8* text;
|
|
|
|
I64 display;
|
|
I64 textAlign;
|
|
|
|
F64 width;
|
|
F64 height;
|
|
I64 widthDistanceType;
|
|
I64 heightDistanceType;
|
|
|
|
@css_area margin;
|
|
@css_border border;
|
|
@css_area padding;
|
|
|
|
// for reflow
|
|
@css_area resolvedMargin;
|
|
@css_border resolvedBorder;
|
|
@css_area resolvedPadding;
|
|
I64 resolvedWidth;
|
|
I64 resolvedHeight;
|
|
|
|
U32 color;
|
|
U32 backgroundColor;
|
|
U32 linethroughColor;
|
|
U32 underlineColor;
|
|
|
|
U8* fontFamily;
|
|
I64 fontSize;
|
|
I64 fontWeight;
|
|
Bool italic;
|
|
};
|
|
|
|
class @html_input_buffer
|
|
{
|
|
U8* data;
|
|
I64 size;
|
|
I64 pos;
|
|
};
|
|
|
|
class @html_tokenizer
|
|
{
|
|
@html_input_buffer inputBuffer;
|
|
I64 state;
|
|
I64 returnState;
|
|
U8 currentInputChar;
|
|
JsonKey* currentAttribute;
|
|
@html_dom_node* appendNode;
|
|
@html_dom_node* currentNode;
|
|
@html_dom_node* originNode;
|
|
I64 nodeTreeDepth;
|
|
I64 dataStateCounter;
|
|
@html_input_buffer tempBuffer;
|
|
Bool consumeTempBuffer;
|
|
I64 numOfImgNodes;
|
|
CTask* mem_task;
|
|
};
|
|
|
|
I64 @round_value_up(I64 numToRound, I64 multiple)
|
|
{
|
|
if (multiple == 0)
|
|
return numToRound;
|
|
I64 remainder = Abs(numToRound) % multiple;
|
|
if (remainder == 0)
|
|
return numToRound;
|
|
if (numToRound < 0)
|
|
return -(Abs(numToRound) - remainder);
|
|
else
|
|
return numToRound + multiple - remainder;
|
|
}
|
|
|
|
U8* @init_growable_string(CTask* mem_task) { return CAlloc(GROWABLE_STRING_INCREMENT_SIZE, mem_task); }
|
|
|
|
U8* @append_char_to_growable_string(U8* s, I64 char, CTask* mem_task)
|
|
{
|
|
I64 oldBufSize =
|
|
@round_value_up(StrLen(s), GROWABLE_STRING_INCREMENT_SIZE - 1);
|
|
I64 newBufSize =
|
|
@round_value_up(StrLen(s) + 1, GROWABLE_STRING_INCREMENT_SIZE - 1);
|
|
if (newBufSize > oldBufSize) {
|
|
U8* newBuf = CAlloc(
|
|
@round_value_up(StrLen(s) + 1, GROWABLE_STRING_INCREMENT_SIZE - 1) * 2, mem_task);
|
|
StrCpy(newBuf, s);
|
|
newBuf[StrLen(newBuf)] = char;
|
|
Free(s);
|
|
return newBuf;
|
|
} else {
|
|
s[StrLen(s)] = char;
|
|
return s;
|
|
}
|
|
}
|
|
|
|
U0 @empty_temp_buffer(@html_tokenizer* t)
|
|
{
|
|
MemSet(t->tempBuffer.data, NULL, 512);
|
|
t->tempBuffer.size = 0;
|
|
t->tempBuffer.pos = 0;
|
|
}
|
|
|
|
U0 @recalculate_temp_buffer_size(@html_tokenizer* t)
|
|
{
|
|
t->tempBuffer.size = StrLen(t->tempBuffer.data);
|
|
t->tempBuffer.pos = 0;
|
|
}
|
|
|
|
U0 @replace_temp_buffer_with_named_character_reference(@html_tokenizer* t)
|
|
{
|
|
|
|
if (!StrICmp(t->tempBuffer.data, "&")) {
|
|
StrCpy(t->tempBuffer.data, "\x11");
|
|
@recalculate_temp_buffer_size(t);
|
|
return;
|
|
}
|
|
if (!StrICmp(t->tempBuffer.data, "å")) {
|
|
StrCpy(t->tempBuffer.data, "\xc3\x85");
|
|
@recalculate_temp_buffer_size(t);
|
|
return;
|
|
}
|
|
if (!StrICmp(t->tempBuffer.data, "•")) {
|
|
StrCpy(t->tempBuffer.data, "\xe2\x80\xa2");
|
|
@recalculate_temp_buffer_size(t);
|
|
return;
|
|
}
|
|
if (!StrICmp(t->tempBuffer.data, "©")) {
|
|
StrCpy(t->tempBuffer.data, "\xc2\xa9");
|
|
@recalculate_temp_buffer_size(t);
|
|
return;
|
|
}
|
|
if (!StrICmp(t->tempBuffer.data, " ")) {
|
|
StrCpy(t->tempBuffer.data, " ");
|
|
@recalculate_temp_buffer_size(t);
|
|
return;
|
|
}
|
|
if (!StrICmp(t->tempBuffer.data, "…")) {
|
|
StrCpy(t->tempBuffer.data, "...");
|
|
@recalculate_temp_buffer_size(t);
|
|
return;
|
|
}
|
|
if (!StrICmp(t->tempBuffer.data, "—")) {
|
|
StrCpy(t->tempBuffer.data, "-");
|
|
@recalculate_temp_buffer_size(t);
|
|
return;
|
|
}
|
|
if (!StrICmp(t->tempBuffer.data, " ")) {
|
|
StrCpy(t->tempBuffer.data, " ");
|
|
@recalculate_temp_buffer_size(t);
|
|
return;
|
|
}
|
|
if (!StrICmp(t->tempBuffer.data, "<")) {
|
|
StrCpy(t->tempBuffer.data, "\x12");
|
|
@recalculate_temp_buffer_size(t);
|
|
return;
|
|
}
|
|
if (!StrICmp(t->tempBuffer.data, ">")) {
|
|
StrCpy(t->tempBuffer.data, ">");
|
|
@recalculate_temp_buffer_size(t);
|
|
return;
|
|
}
|
|
if (!StrICmp(t->tempBuffer.data, """)) {
|
|
StrCpy(t->tempBuffer.data, "\"");
|
|
@recalculate_temp_buffer_size(t);
|
|
return;
|
|
}
|
|
if (!StrICmp(t->tempBuffer.data, "&zerowidthspace;")) {
|
|
StrCpy(t->tempBuffer.data, "");
|
|
@recalculate_temp_buffer_size(t);
|
|
return;
|
|
}
|
|
|
|
StrCpy(t->tempBuffer.data, "?");
|
|
@recalculate_temp_buffer_size(t);
|
|
return;
|
|
}
|
|
|
|
I64 @hex_table_i;
|
|
I64 @hex_table[256];
|
|
MemSet(&@hex_table, NULL, sizeof(I64) * 256);
|
|
|
|
for (@hex_table_i = '0'; @hex_table_i < ':'; @hex_table_i++) {
|
|
@hex_table[@hex_table_i] = @hex_table_i - '0';
|
|
}
|
|
|
|
for (@hex_table_i = 'A'; @hex_table_i < 'G'; @hex_table_i++) {
|
|
@hex_table[@hex_table_i] = 10 + (@hex_table_i - 'A');
|
|
}
|
|
|
|
for (@hex_table_i = 'a'; @hex_table_i < 'g'; @hex_table_i++) {
|
|
@hex_table[@hex_table_i] = 10 + (@hex_table_i - 'a');
|
|
}
|
|
|
|
I64 @utf8_encode(U8* out, I64 utf)
|
|
{
|
|
if (utf <= 0x7F) {
|
|
// Plain ASCII
|
|
out[0] = utf;
|
|
out[1] = 0;
|
|
return 1;
|
|
} else if (utf <= 0x07FF) {
|
|
// 2-byte unicode
|
|
out[0] = (((utf >> 6) & 0x1F) | 0xC0);
|
|
out[1] = (((utf >> 0) & 0x3F) | 0x80);
|
|
out[2] = 0;
|
|
return 2;
|
|
} else if (utf <= 0xFFFF) {
|
|
// 3-byte unicode
|
|
out[0] = (((utf >> 12) & 0x0F) | 0xE0);
|
|
out[1] = (((utf >> 6) & 0x3F) | 0x80);
|
|
out[2] = (((utf >> 0) & 0x3F) | 0x80);
|
|
out[3] = 0;
|
|
return 3;
|
|
} else if (utf <= 0x10FFFF) {
|
|
// 4-byte unicode
|
|
out[0] = (((utf >> 18) & 0x07) | 0xF0);
|
|
out[1] = (((utf >> 12) & 0x3F) | 0x80);
|
|
out[2] = (((utf >> 6) & 0x3F) | 0x80);
|
|
out[3] = (((utf >> 0) & 0x3F) | 0x80);
|
|
out[4] = 0;
|
|
return 4;
|
|
} else {
|
|
// error - use replacement character
|
|
out[0] = 0xEF;
|
|
out[1] = 0xBF;
|
|
out[2] = 0xBD;
|
|
out[3] = 0;
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
U0 @replace_temp_buffer_with_dec_character_reference(@html_tokenizer* t)
|
|
{
|
|
t->tempBuffer.data[StrLen(t->tempBuffer.data) - 1] = NULL; // chop off semicolon
|
|
I64 charCode = Str2I64(t->tempBuffer.data + 2);
|
|
@utf8_encode(t->tempBuffer.data, charCode);
|
|
@recalculate_temp_buffer_size(t);
|
|
}
|
|
|
|
U0 @replace_temp_buffer_with_hex_character_reference(@html_tokenizer* t)
|
|
{
|
|
I64 dec_char = 0;
|
|
|
|
t->tempBuffer.data[StrLen(t->tempBuffer.data) - 1] = NULL; // chop off semicolon
|
|
|
|
U8* ch = t->tempBuffer.data + 3;
|
|
|
|
while (*ch && dec_char >= 0) {
|
|
dec_char = (dec_char << 4) | @hex_table[*ch++];
|
|
}
|
|
|
|
StrPrint(t->tempBuffer.data, "&#%d;", dec_char);
|
|
@recalculate_temp_buffer_size(t);
|
|
|
|
@replace_temp_buffer_with_dec_character_reference(t);
|
|
}
|
|
|
|
U0 @replace_temp_buffer_with_numeric_character_reference(@html_tokenizer* t)
|
|
{
|
|
switch (t->tempBuffer.data[2]) {
|
|
case 'x':
|
|
@replace_temp_buffer_with_hex_character_reference(t);
|
|
break;
|
|
default:
|
|
@replace_temp_buffer_with_dec_character_reference(t);
|
|
break;
|
|
}
|
|
}
|
|
|
|
U0 @append_char_to_temp_buffer(@html_tokenizer* t, I64 char)
|
|
{
|
|
t->tempBuffer.data[StrLen(t->tempBuffer.data)] = char;
|
|
t->tempBuffer.size++;
|
|
}
|
|
|
|
@html_dom_node* @create_new_node(U8* tagName, CTask* mem_task)
|
|
{
|
|
@html_dom_node* node = CAlloc(sizeof(@html_dom_node), mem_task);
|
|
StrCpy(node->tagName, tagName);
|
|
node->attributes = Json.CreateObject(mem_task);
|
|
node->children = Json.CreateArray(mem_task);
|
|
node->text = @init_growable_string(mem_task);
|
|
node->sig = JSON_SIG;
|
|
node->type = JSON_HTML;
|
|
return node;
|
|
}
|
|
|
|
U0 @init_tokenizer(@html_tokenizer* t, U8* data, I64 size, CTask* mem_task)
|
|
{
|
|
t->mem_task = mem_task;
|
|
t->inputBuffer.data = data;
|
|
t->inputBuffer.size = size;
|
|
t->inputBuffer.pos = 0;
|
|
t->state = HTML_STATE_DATA;
|
|
t->tempBuffer.data = CAlloc(512, t->mem_task);
|
|
t->tempBuffer.size = size;
|
|
t->tempBuffer.pos = 0;
|
|
t->originNode = @create_new_node("Document", t->mem_task);
|
|
t->appendNode = t->originNode;
|
|
t->currentNode = t->originNode;
|
|
t->consumeTempBuffer = FALSE;
|
|
t->dataStateCounter = 0;
|
|
t->numOfImgNodes = 0;
|
|
}
|
|
|
|
U0 @consume_next_input_char(@html_tokenizer* t)
|
|
{
|
|
if (t->consumeTempBuffer) {
|
|
if (t->tempBuffer.pos < t->tempBuffer.size) {
|
|
t->currentInputChar = t->tempBuffer.data[t->tempBuffer.pos++];
|
|
return;
|
|
} else {
|
|
t->consumeTempBuffer = FALSE;
|
|
}
|
|
}
|
|
t->currentInputChar = t->inputBuffer.data[t->inputBuffer.pos++];
|
|
}
|
|
|
|
U0 @emit_current_character(@html_tokenizer* t)
|
|
{
|
|
if (!t->dataStateCounter) {
|
|
@html_dom_node* node = @create_new_node("InternalTextNode", t->mem_task);
|
|
t->currentNode = node;
|
|
}
|
|
t->currentNode->text = @append_char_to_growable_string(t->currentNode->text,
|
|
t->currentInputChar, t->mem_task);
|
|
t->dataStateCounter++;
|
|
}
|
|
|
|
Bool @node_is_self_closing(@html_dom_node* node)
|
|
{
|
|
if (!StrICmp(node->tagName, "InternalTextNode"))
|
|
return TRUE;
|
|
if (!StrICmp(node->tagName, "area"))
|
|
return TRUE;
|
|
if (!StrICmp(node->tagName, "base"))
|
|
return TRUE;
|
|
if (!StrICmp(node->tagName, "br"))
|
|
return TRUE;
|
|
if (!StrICmp(node->tagName, "col"))
|
|
return TRUE;
|
|
if (!StrICmp(node->tagName, "embed"))
|
|
return TRUE;
|
|
if (!StrICmp(node->tagName, "hr"))
|
|
return TRUE;
|
|
if (!StrICmp(node->tagName, "img"))
|
|
return TRUE;
|
|
if (!StrICmp(node->tagName, "input"))
|
|
return TRUE;
|
|
if (!StrICmp(node->tagName, "link"))
|
|
return TRUE;
|
|
if (!StrICmp(node->tagName, "meta"))
|
|
return TRUE;
|
|
if (!StrICmp(node->tagName, "param"))
|
|
return TRUE;
|
|
if (!StrICmp(node->tagName, "source"))
|
|
return TRUE;
|
|
if (!StrICmp(node->tagName, "track"))
|
|
return TRUE;
|
|
if (!StrICmp(node->tagName, "wbr"))
|
|
return TRUE;
|
|
return FALSE;
|
|
}
|
|
|
|
U0 @emit_current_node(@html_tokenizer* t)
|
|
{
|
|
@html_dom_node* origAppendNode = t->appendNode;
|
|
if (t->currentNode->tagName[0] == '/') {
|
|
if (StrICmp(t->appendNode->tagName, t->currentNode->tagName + 1)) {
|
|
/* end tag tagName for currentNode does not match appendNode tagName,
|
|
* traverse up parentNode until we find a match */
|
|
while (StrICmp(t->appendNode->tagName, t->currentNode->tagName + 1)) {
|
|
if (!StrICmp(t->appendNode->tagName,
|
|
"Document")) { // If we've traversed this far up, then the
|
|
// closing tag is invalid
|
|
t->appendNode = origAppendNode;
|
|
return;
|
|
}
|
|
t->appendNode = t->appendNode->parentNode;
|
|
}
|
|
}
|
|
t->appendNode = t->appendNode->parentNode;
|
|
return;
|
|
}
|
|
// JsonItem* nodeItem = CAlloc(sizeof(JsonItem), t->mem_task);
|
|
t->currentNode->parentNode = t->appendNode;
|
|
// nodeItem->value = t->currentNode;
|
|
// Json.AppendItem(t->appendNode->children, nodeItem);
|
|
t->appendNode->children->append(t->currentNode);
|
|
if (!@node_is_self_closing(t->currentNode))
|
|
t->appendNode = t->currentNode;
|
|
}
|
|
|
|
U0 @set_current_attribute_on_current_node(@html_tokenizer* t)
|
|
{
|
|
t->currentNode->attributes->set(t->currentAttribute->name,
|
|
t->currentAttribute->value, JSON_STRING);
|
|
// Json.Set(t->currentNode->attributes, t->currentAttribute->name,
|
|
// t->currentAttribute->value, JSON_STRING, t->mem_task);
|
|
}
|
|
|
|
Bool @skip_script_data(@html_tokenizer* t)
|
|
{
|
|
// FIXME: This will work in most cases, except for when </script> tags are escaped in SCRIPT data.
|
|
U8 cmpbuf[16];
|
|
MemSet(cmpbuf, NULL, 16);
|
|
MemCpy(cmpbuf, t->inputBuffer.data + t->inputBuffer.pos, 6);
|
|
if (!StrICmp(cmpbuf, "script")) {
|
|
t->inputBuffer.pos += 6;
|
|
while (StrICmp(cmpbuf, "</script>")) {
|
|
MemSet(cmpbuf, NULL, 16);
|
|
MemCpy(cmpbuf, t->inputBuffer.data + t->inputBuffer.pos, 9);
|
|
++t->inputBuffer.pos;
|
|
}
|
|
t->inputBuffer.pos += 8;
|
|
return TRUE;
|
|
}
|
|
return FALSE;
|
|
}
|
|
|
|
U0 @tokenizer_html_state_data(@html_tokenizer* t)
|
|
{
|
|
@consume_next_input_char(t);
|
|
switch (t->currentInputChar) {
|
|
case '&':
|
|
// Set the return state to the data state. Switch to the character reference
|
|
// state.
|
|
t->returnState = HTML_STATE_DATA;
|
|
t->state = HTML_STATE_CHARACTER_REFERENCE;
|
|
break;
|
|
case '<':
|
|
if (!@skip_script_data(t)) {
|
|
// Switch to the tag open state.
|
|
if (t->dataStateCounter)
|
|
@emit_current_node(t);
|
|
t->dataStateCounter = 0;
|
|
t->state = HTML_STATE_TAG_OPEN;
|
|
}
|
|
break;
|
|
default:
|
|
// Emit the current input character as a character token.
|
|
@emit_current_character(t);
|
|
break;
|
|
}
|
|
}
|
|
|
|
U0 @tokenizer_html_state_tag_open(@html_tokenizer* t)
|
|
{
|
|
@consume_next_input_char(t);
|
|
switch (t->currentInputChar) {
|
|
case '!':
|
|
// Switch to the markup declaration open state.
|
|
t->state = HTML_STATE_MARKUP_DECLARATION_OPEN;
|
|
break;
|
|
case '/':
|
|
// Switch to the end tag open state.
|
|
t->state = HTML_STATE_END_TAG_OPEN;
|
|
break;
|
|
case 'A' ... 'Z':
|
|
case 'a' ... 'z':
|
|
// Create a new start tag token, set its tag name to the empty string.
|
|
// Reconsume in the tag name state.
|
|
@html_dom_node* node = @create_new_node("", t->mem_task);
|
|
t->currentNode = node;
|
|
t->inputBuffer.pos--;
|
|
t->state = HTML_STATE_TAG_NAME;
|
|
break;
|
|
case '?':
|
|
// This is an unexpected-question-mark-instead-of-tag-name parse error.
|
|
// Create a comment token whose data is the empty string. Reconsume in the
|
|
// bogus comment state.
|
|
t->inputBuffer.pos--;
|
|
t->state = HTML_STATE_BOGUS_COMMENT;
|
|
break;
|
|
default:
|
|
// This is an invalid-first-character-of-tag-name parse error. Emit a U+003C
|
|
// LESS-THAN SIGN character token. Reconsume in the data state.
|
|
@emit_current_character(t);
|
|
t->inputBuffer.pos--;
|
|
t->state = HTML_STATE_DATA;
|
|
break;
|
|
}
|
|
}
|
|
|
|
U0 @tokenizer_html_state_markup_declaration_open(@html_tokenizer* t)
|
|
{
|
|
if ((t->inputBuffer.data[t->inputBuffer.pos] == '-') && (t->inputBuffer.data[t->inputBuffer.pos + 1] == '-')) {
|
|
// Consume those two characters, create a comment token whose data is the
|
|
// empty string, and switch to the comment state.
|
|
t->inputBuffer.pos += 2;
|
|
t->state = HTML_STATE_COMMENT;
|
|
return;
|
|
}
|
|
U8 buf[8];
|
|
buf[7] = NULL;
|
|
MemCpy(buf, t->inputBuffer.data + t->inputBuffer.pos, 7);
|
|
if (!StrICmp(buf, "DOCTYPE")) {
|
|
// Consume those characters and switch to the DOCTYPE state.
|
|
t->inputBuffer.pos += 7;
|
|
t->state = HTML_STATE_DOCTYPE;
|
|
return;
|
|
}
|
|
t->state = HTML_STATE_BOGUS_COMMENT;
|
|
}
|
|
|
|
U0 @tokenizer_html_state_doctype(@html_tokenizer* t)
|
|
{
|
|
@consume_next_input_char(t);
|
|
switch (t->currentInputChar) {
|
|
case '\n':
|
|
case '\r':
|
|
case '\t':
|
|
case ' ':
|
|
// Switch to the before DOCTYPE name state.
|
|
t->state = HTML_STATE_BEFORE_DOCTYPE_NAME;
|
|
break;
|
|
case '>':
|
|
// Reconsume in the before DOCTYPE name state.
|
|
t->inputBuffer.pos--;
|
|
t->state = HTML_STATE_BEFORE_DOCTYPE_NAME;
|
|
break;
|
|
default:
|
|
// This is a missing-whitespace-before-doctype-name parse error. Reconsume
|
|
// in the before DOCTYPE name state.
|
|
t->inputBuffer.pos--;
|
|
t->state = HTML_STATE_BEFORE_DOCTYPE_NAME;
|
|
break;
|
|
}
|
|
}
|
|
|
|
U0 @tokenizer_html_state_before_doctype_name(@html_tokenizer* t)
|
|
{
|
|
@consume_next_input_char(t);
|
|
switch (t->currentInputChar) {
|
|
case '\n':
|
|
case '\r':
|
|
case '\t':
|
|
case ' ':
|
|
// Ignore the character.
|
|
break;
|
|
case 'A' ... 'Z':
|
|
// Create a new DOCTYPE token. Set the token's name to the lowercase version
|
|
// of the current input character (add 0x0020 to the character's code
|
|
// point). Switch to the DOCTYPE name state.
|
|
t->state = HTML_STATE_DOCTYPE_NAME;
|
|
break;
|
|
case '>':
|
|
// This is a missing-doctype-name parse error. Create a new DOCTYPE token.
|
|
// Set its force-quirks flag to on. Switch to the data state. Emit the
|
|
// current token.
|
|
@emit_current_character(t);
|
|
t->state = HTML_STATE_DATA;
|
|
break;
|
|
default:
|
|
// Create a new DOCTYPE token. Set the token's name to the current input
|
|
// character. Switch to the DOCTYPE name state.
|
|
t->state = HTML_STATE_DOCTYPE_NAME;
|
|
break;
|
|
}
|
|
}
|
|
|
|
U0 @tokenizer_html_state_doctype_name(@html_tokenizer* t)
|
|
{
|
|
@consume_next_input_char(t);
|
|
switch (t->currentInputChar) {
|
|
case '\n':
|
|
case '\r':
|
|
case '\t':
|
|
case ' ':
|
|
// Switch to the after DOCTYPE name state.
|
|
t->state = HTML_STATE_AFTER_DOCTYPE_NAME;
|
|
break;
|
|
case '>':
|
|
// Switch to the data state. Emit the current DOCTYPE token.
|
|
t->state = HTML_STATE_DATA;
|
|
break;
|
|
case 'A' ... 'Z':
|
|
// Append the lowercase version of the current input character (add 0x0020
|
|
// to the character's code point) to the current DOCTYPE token's name.
|
|
break;
|
|
default:
|
|
// Append the current input character to the current DOCTYPE token's name.
|
|
break;
|
|
}
|
|
}
|
|
|
|
U0 @tokenizer_html_state_after_doctype_name(@html_tokenizer* t)
|
|
{
|
|
@consume_next_input_char(t);
|
|
switch (t->currentInputChar) {
|
|
case '\n':
|
|
case '\r':
|
|
case '\t':
|
|
case ' ':
|
|
// Ignore the character.
|
|
break;
|
|
case 'A' ... 'Z':
|
|
// Create a new DOCTYPE token. Set the token's name to the lowercase version
|
|
// of the current input character (add 0x0020 to the character's code
|
|
// point). Switch to the DOCTYPE name state.
|
|
t->state = HTML_STATE_DOCTYPE_NAME;
|
|
break;
|
|
case '>':
|
|
// Switch to the data state. Emit the current DOCTYPE token.
|
|
t->state = HTML_STATE_DATA;
|
|
break;
|
|
default:
|
|
// Reconsume in the bogus DOCTYPE state.
|
|
t->inputBuffer.pos--;
|
|
t->state = HTML_STATE_BOGUS_DOCTYPE;
|
|
break;
|
|
}
|
|
}
|
|
|
|
U0 @tokenizer_html_state_bogus_doctype(@html_tokenizer* t)
|
|
{
|
|
@consume_next_input_char(t);
|
|
switch (t->currentInputChar) {
|
|
case '>':
|
|
// Switch to the data state. Emit the DOCTYPE token.
|
|
t->state = HTML_STATE_DATA;
|
|
break;
|
|
default:
|
|
// Ignore the character.
|
|
break;
|
|
}
|
|
}
|
|
|
|
U0 @tokenizer_html_state_bogus_comment(@html_tokenizer* t)
|
|
{
|
|
@consume_next_input_char(t);
|
|
switch (t->currentInputChar) {
|
|
case '>':
|
|
// Switch to the data state. Emit the DOCTYPE token.
|
|
t->state = HTML_STATE_DATA;
|
|
break;
|
|
default:
|
|
// Ignore the character.
|
|
break;
|
|
}
|
|
}
|
|
|
|
U0 @tokenizer_html_state_tag_name(@html_tokenizer* t)
|
|
{
|
|
@consume_next_input_char(t);
|
|
switch (t->currentInputChar) {
|
|
case '\n':
|
|
case '\r':
|
|
case '\t':
|
|
case ' ':
|
|
// Switch to the before attribute name state.
|
|
t->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
|
|
break;
|
|
case '/':
|
|
// Switch to the self-closing start tag state.
|
|
t->state = HTML_STATE_SELF_CLOSING_START_TAG;
|
|
break;
|
|
case '>':
|
|
// Switch to the data state. Emit the current tag token.
|
|
@emit_current_node(t);
|
|
t->state = HTML_STATE_DATA;
|
|
break;
|
|
case 'A' ... 'Z':
|
|
// Append the lowercase version of the current input character (add 0x0020
|
|
// to the character's code point) to the current tag token's tag name.
|
|
t->currentNode->tagName[StrLen(t->currentNode->tagName)] = t->currentInputChar + 0x20;
|
|
if (!StrICmp(t->currentNode->tagName, "img"))
|
|
t->numOfImgNodes++;
|
|
if (!StrICmp(t->currentNode->tagName, "body")) {
|
|
t->currentNode->backgroundColor = Color(255, 255, 255);
|
|
t->currentNode->color = Color(0, 0, 0);
|
|
}
|
|
break;
|
|
default:
|
|
// Append the current input character to the current tag token's tag name.
|
|
t->currentNode->tagName[StrLen(t->currentNode->tagName)] = t->currentInputChar;
|
|
if (!StrICmp(t->currentNode->tagName, "img"))
|
|
t->numOfImgNodes++;
|
|
if (!StrICmp(t->currentNode->tagName, "body")) {
|
|
t->currentNode->backgroundColor = Color(255, 255, 255);
|
|
t->currentNode->color = Color(0, 0, 0);
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
U0 @tokenizer_html_state_before_attribute_name(@html_tokenizer* t)
|
|
{
|
|
@consume_next_input_char(t);
|
|
switch (t->currentInputChar) {
|
|
case '\n':
|
|
case '\r':
|
|
case '\t':
|
|
case ' ':
|
|
// Ignore the character.
|
|
break;
|
|
case '/':
|
|
case '>':
|
|
// Reconsume in the after attribute name state.
|
|
t->inputBuffer.pos--;
|
|
t->state = HTML_STATE_AFTER_ATTRIBUTE_NAME;
|
|
break;
|
|
case '=':
|
|
// This is an unexpected-equals-sign-before-attribute-name parse error.
|
|
// Start a new attribute in the current tag token. Set that attribute's name
|
|
// to the current input character, and its value to the empty string. Switch
|
|
// to the attribute name state.
|
|
t->currentAttribute = CAlloc(sizeof(JsonKey), t->mem_task);
|
|
t->currentAttribute->name = @init_growable_string(t->mem_task);
|
|
t->currentAttribute->value = @init_growable_string(t->mem_task);
|
|
t->currentAttribute->name = @append_char_to_growable_string(
|
|
t->currentAttribute->name, t->currentInputChar, t->mem_task);
|
|
t->state = HTML_STATE_ATTRIBUTE_NAME;
|
|
break;
|
|
default:
|
|
// Start a new attribute in the current tag token. Set that attribute name
|
|
// and value to the empty string. Reconsume in the attribute name state.
|
|
t->currentAttribute = CAlloc(sizeof(JsonKey), t->mem_task);
|
|
t->currentAttribute->name = @init_growable_string(t->mem_task);
|
|
t->currentAttribute->value = @init_growable_string(t->mem_task);
|
|
t->inputBuffer.pos--;
|
|
t->state = HTML_STATE_ATTRIBUTE_NAME;
|
|
break;
|
|
}
|
|
}
|
|
|
|
U0 @tokenizer_html_state_attribute_name(@html_tokenizer* t)
|
|
{
|
|
@consume_next_input_char(t);
|
|
switch (t->currentInputChar) {
|
|
case '\n':
|
|
case '\r':
|
|
case '\t':
|
|
case ' ':
|
|
case '/':
|
|
case '>':
|
|
// Reconsume in the after attribute name state.
|
|
t->inputBuffer.pos--;
|
|
t->state = HTML_STATE_AFTER_ATTRIBUTE_NAME;
|
|
break;
|
|
case '=':
|
|
// Switch to the before attribute value state.
|
|
t->state = HTML_STATE_BEFORE_ATTRIBUTE_VALUE;
|
|
break;
|
|
case 'A' ... 'Z':
|
|
// Append the lowercase version of the current input character (add 0x0020
|
|
// to the character's code point) to the current attribute's name.
|
|
t->currentAttribute->name = @append_char_to_growable_string(
|
|
t->currentAttribute->name, t->currentInputChar + 0x20, t->mem_task);
|
|
break;
|
|
case '"':
|
|
case '\'':
|
|
case '<':
|
|
// This is an unexpected-character-in-attribute-name parse error. Treat it as
|
|
// per the "anything else" entry below.
|
|
default:
|
|
// Append the current input character to the current attribute's name.
|
|
t->currentAttribute->name = @append_char_to_growable_string(
|
|
t->currentAttribute->name, t->currentInputChar, t->mem_task);
|
|
break;
|
|
}
|
|
}
|
|
|
|
U0 @tokenizer_html_state_before_attribute_value(@html_tokenizer* t)
|
|
{
|
|
@consume_next_input_char(t);
|
|
switch (t->currentInputChar) {
|
|
case '\n':
|
|
case '\r':
|
|
case '\t':
|
|
case ' ':
|
|
// Ignore the character.
|
|
break;
|
|
case '"':
|
|
// Switch to the attribute value (double-quoted) state.
|
|
t->state = HTML_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED;
|
|
break;
|
|
case '\'':
|
|
// Switch to the attribute value (single-quoted) state.
|
|
t->state = HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED;
|
|
break;
|
|
case '>':
|
|
// This is a missing-attribute-value parse error. Switch to the data state.
|
|
// Emit the current tag token.
|
|
@emit_current_node(t);
|
|
t->state = HTML_STATE_DATA;
|
|
break;
|
|
default:
|
|
// Reconsume in the attribute value (unquoted) state.
|
|
t->inputBuffer.pos--;
|
|
t->state = HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED;
|
|
break;
|
|
}
|
|
}
|
|
|
|
U0 @tokenizer_html_state_attribute_value_double_quoted(@html_tokenizer* t)
|
|
{
|
|
@consume_next_input_char(t);
|
|
switch (t->currentInputChar) {
|
|
case '"':
|
|
// Switch to the after attribute value (quoted) state.
|
|
@set_current_attribute_on_current_node(t);
|
|
t->state = HTML_STATE_AFTER_ATTRIBUTE_VALUE_QUOTED;
|
|
break;
|
|
/*
|
|
case '&':
|
|
// Set the return state to the attribute value (double-quoted) state. Switch
|
|
// to the character reference state.
|
|
t->returnState = HTML_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED;
|
|
t->state = HTML_STATE_CHARACTER_REFERENCE;
|
|
break;
|
|
*/
|
|
default:
|
|
// Append the current input character to the current attribute's value.
|
|
t->currentAttribute->value = @append_char_to_growable_string(
|
|
t->currentAttribute->value, t->currentInputChar, t->mem_task);
|
|
break;
|
|
}
|
|
}
|
|
|
|
U0 @tokenizer_html_state_attribute_value_unquoted(@html_tokenizer* t)
|
|
{
|
|
@consume_next_input_char(t);
|
|
switch (t->currentInputChar) {
|
|
case '\n':
|
|
case '\r':
|
|
case '\t':
|
|
case ' ':
|
|
// Switch to the before attribute name state.
|
|
t->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
|
|
break;
|
|
/*
|
|
case '&':
|
|
// Set the return state to the attribute value (double-quoted) state. Switch
|
|
// to the character reference state.
|
|
t->returnState = HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED;
|
|
t->state = HTML_STATE_CHARACTER_REFERENCE;
|
|
break;
|
|
*/
|
|
case '>':
|
|
// Switch to the data state. Emit the current tag token.
|
|
@emit_current_node(t);
|
|
t->state = HTML_STATE_DATA;
|
|
break;
|
|
default:
|
|
// Append the current input character to the current attribute's value.
|
|
t->currentAttribute->value = @append_char_to_growable_string(
|
|
t->currentAttribute->value, t->currentInputChar, t->mem_task);
|
|
break;
|
|
}
|
|
}
|
|
|
|
U0 @tokenizer_html_state_attribute_value_single_quoted(@html_tokenizer* t)
|
|
{
|
|
@consume_next_input_char(t);
|
|
switch (t->currentInputChar) {
|
|
case '\'':
|
|
// Switch to the after attribute value (quoted) state.
|
|
@set_current_attribute_on_current_node(t);
|
|
t->state = HTML_STATE_AFTER_ATTRIBUTE_VALUE_QUOTED;
|
|
break;
|
|
/*
|
|
case '&':
|
|
// Set the return state to the attribute value (double-quoted) state. Switch
|
|
// to the character reference state.
|
|
t->returnState = HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED;
|
|
t->state = HTML_STATE_CHARACTER_REFERENCE;
|
|
break;
|
|
*/
|
|
default:
|
|
// Append the current input character to the current attribute's value.
|
|
t->currentAttribute->value = @append_char_to_growable_string(
|
|
t->currentAttribute->value, t->currentInputChar, t->mem_task);
|
|
break;
|
|
}
|
|
}
|
|
|
|
U0 @tokenizer_html_state_after_attribute_value_quoted(@html_tokenizer* t)
|
|
{
|
|
@consume_next_input_char(t);
|
|
switch (t->currentInputChar) {
|
|
case '\n':
|
|
case '\r':
|
|
case '\t':
|
|
case ' ':
|
|
// Switch to the before attribute name state.
|
|
t->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
|
|
break;
|
|
case '/':
|
|
// Switch to the self-closing start tag state.
|
|
t->state = HTML_STATE_SELF_CLOSING_START_TAG;
|
|
break;
|
|
case '>':
|
|
// Switch to the data state. Emit the current tag token.
|
|
@emit_current_node(t);
|
|
t->state = HTML_STATE_DATA;
|
|
break;
|
|
default:
|
|
// This is a missing-whitespace-between-attributes parse error. Reconsume in
|
|
// the before attribute name state.
|
|
t->inputBuffer.pos--;
|
|
t->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
|
|
break;
|
|
}
|
|
}
|
|
|
|
U0 @tokenizer_html_state_end_tag_open(@html_tokenizer* t)
|
|
{
|
|
@consume_next_input_char(t);
|
|
switch (t->currentInputChar) {
|
|
case 'A' ... 'Z':
|
|
case 'a' ... 'z':
|
|
// Create a new end tag token, set its tag name to the empty string.
|
|
// Reconsume in the tag name state.
|
|
@html_dom_node* node = @create_new_node("/", t->mem_task);
|
|
t->currentNode = node;
|
|
t->inputBuffer.pos--;
|
|
t->state = HTML_STATE_TAG_NAME;
|
|
break;
|
|
case '>':
|
|
// This is a missing-end-tag-name parse error. Switch to the data state.
|
|
t->state = HTML_STATE_DATA;
|
|
break;
|
|
default:
|
|
// This is an invalid-first-character-of-tag-name parse error. Create a
|
|
// comment token whose data is the empty string. Reconsume in the bogus
|
|
// comment state.
|
|
t->inputBuffer.pos--;
|
|
t->state = HTML_STATE_BOGUS_COMMENT;
|
|
break;
|
|
}
|
|
}
|
|
|
|
U0 @tokenizer_html_state_after_attribute_name(@html_tokenizer* t)
|
|
{
|
|
@consume_next_input_char(t);
|
|
switch (t->currentInputChar) {
|
|
case '\n':
|
|
case '\r':
|
|
case '\t':
|
|
case ' ':
|
|
// Ignore the character.
|
|
break;
|
|
/*
|
|
case '"':
|
|
// Switch to the attribute value (double-quoted) state.
|
|
t->state = HTML_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED;
|
|
break;
|
|
case '\'':
|
|
// Switch to the attribute value (single-quoted) state.
|
|
t->state = HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED;
|
|
break;
|
|
*/
|
|
case '/':
|
|
// Switch to the self-closing start tag state.
|
|
@set_current_attribute_on_current_node(t);
|
|
t->state = HTML_STATE_SELF_CLOSING_START_TAG;
|
|
break;
|
|
case '=':
|
|
// Switch to the before attribute value state.
|
|
t->state = HTML_STATE_BEFORE_ATTRIBUTE_VALUE;
|
|
break;
|
|
case '>':
|
|
// Switch to the data state. Emit the current tag token.
|
|
@set_current_attribute_on_current_node(t);
|
|
@emit_current_node(t);
|
|
t->state = HTML_STATE_DATA;
|
|
break;
|
|
default:
|
|
// Start a new attribute in the current tag token. Set that attribute name
|
|
// and value to the empty string. Reconsume in the attribute name state.
|
|
t->currentAttribute = CAlloc(sizeof(JsonKey), t->mem_task);
|
|
t->currentAttribute->name = @init_growable_string(t->mem_task);
|
|
t->currentAttribute->value = @init_growable_string(t->mem_task);
|
|
t->inputBuffer.pos--;
|
|
t->state = HTML_STATE_ATTRIBUTE_NAME;
|
|
break;
|
|
}
|
|
}
|
|
|
|
U0 @tokenizer_html_state_self_closing_start_tag(@html_tokenizer* t)
|
|
{
|
|
@consume_next_input_char(t);
|
|
switch (t->currentInputChar) {
|
|
case '>':
|
|
// Set the self-closing flag of the current tag token. Switch to the data
|
|
// state. Emit the current tag token.
|
|
@emit_current_node(t);
|
|
t->state = HTML_STATE_DATA;
|
|
break;
|
|
default:
|
|
// This is an unexpected-solidus-in-tag parse error. Reconsume in the before
|
|
// attribute name state.
|
|
t->inputBuffer.pos--;
|
|
t->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
|
|
break;
|
|
}
|
|
}
|
|
|
|
U0 @tokenizer_html_state_character_reference(@html_tokenizer* t)
|
|
{
|
|
// Set the temporary buffer to the empty string.
|
|
@empty_temp_buffer(t);
|
|
// Append a U+0026 AMPERSAND (&) character to the temporary buffer.
|
|
@append_char_to_temp_buffer(t, '&');
|
|
@consume_next_input_char(t);
|
|
switch (t->currentInputChar) {
|
|
case 'A' ... 'Z':
|
|
case 'a' ... 'z':
|
|
// Reconsume in the named character reference state.
|
|
t->inputBuffer.pos--;
|
|
t->state = HTML_STATE_NAMED_CHARACTER_REFERENCE;
|
|
break;
|
|
case '#':
|
|
// Append the current input character to the temporary buffer. Switch to the
|
|
// numeric character reference state.
|
|
@append_char_to_temp_buffer(t, '#');
|
|
t->state = HTML_STATE_NUMERIC_CHARACTER_REFERENCE;
|
|
break;
|
|
default:
|
|
// Flush code points consumed as a character reference. Reconsume in the
|
|
// return state.
|
|
t->consumeTempBuffer = TRUE;
|
|
t->state = t->returnState;
|
|
break;
|
|
}
|
|
}
|
|
|
|
U0 @tokenizer_html_state_named_character_reference(@html_tokenizer* t)
|
|
{
|
|
// Consume the maximum number of characters possible, where the consumed
|
|
// characters are one of the identifiers in the first column of the named
|
|
// character references table. Append each character to the temporary buffer
|
|
// when it's consumed.
|
|
@consume_next_input_char(t);
|
|
@append_char_to_temp_buffer(t, t->currentInputChar);
|
|
switch (t->currentInputChar) {
|
|
case ';':
|
|
@replace_temp_buffer_with_named_character_reference(t);
|
|
t->consumeTempBuffer = TRUE;
|
|
t->state = t->returnState;
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
U0 @tokenizer_html_state_numeric_character_reference(@html_tokenizer* t)
|
|
{
|
|
@consume_next_input_char(t);
|
|
@append_char_to_temp_buffer(t, t->currentInputChar);
|
|
switch (t->currentInputChar) {
|
|
case ';':
|
|
@replace_temp_buffer_with_numeric_character_reference(t);
|
|
t->consumeTempBuffer = TRUE;
|
|
t->state = t->returnState;
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
U0 @tokenizer_html_state_comment_start(@html_tokenizer* t)
|
|
{
|
|
@consume_next_input_char(t);
|
|
switch (t->currentInputChar) {
|
|
case '-':
|
|
// Switch to the comment start dash state.
|
|
t->state = HTML_STATE_COMMENT_START_DASH;
|
|
break;
|
|
case '>':
|
|
// This is an abrupt-closing-of-empty-comment parse error. Switch to the
|
|
// data state. Emit the current comment token.
|
|
t->state = HTML_STATE_DATA;
|
|
break;
|
|
default:
|
|
// Reconsume in the comment state.
|
|
t->inputBuffer.pos--;
|
|
t->state = HTML_STATE_COMMENT;
|
|
break;
|
|
}
|
|
}
|
|
|
|
U0 @tokenizer_html_state_comment(@html_tokenizer* t)
|
|
{
|
|
if ((t->inputBuffer.data[t->inputBuffer.pos] == '-') && (t->inputBuffer.data[t->inputBuffer.pos + 1] == '-') && (t->inputBuffer.data[t->inputBuffer.pos + 2] == '>')) {
|
|
// Consume those three characters, and switch to the data state.
|
|
t->inputBuffer.pos += 3;
|
|
t->state = HTML_STATE_DATA;
|
|
return;
|
|
}
|
|
@consume_next_input_char(t);
|
|
}
|
|
|
|
U0 @dump_node(@html_tokenizer* t, @html_dom_node* node)
|
|
{
|
|
|
|
I64 i;
|
|
|
|
if (StrICmp(node->tagName, "InternalTextNode") && StrICmp(node->tagName, "Document")) {
|
|
for (i = 0; i < t->nodeTreeDepth; i++)
|
|
"-";
|
|
"<%s> : parentNode: <%s 0x%08x>\n", node->tagName,
|
|
node->parentNode->tagName, node->parentNode;
|
|
}
|
|
|
|
if (node->children->length) {
|
|
t->nodeTreeDepth += 2;
|
|
for (i = 0; i < node->children->length; i++) {
|
|
@dump_node(t, node->children->@(i));
|
|
//@dump_node(t, Json.ArrayIndex(node->children, i));
|
|
}
|
|
t->nodeTreeDepth -= 2;
|
|
}
|
|
}
|
|
|
|
U0 @dump_node_list(@html_tokenizer* t)
|
|
{
|
|
t->nodeTreeDepth = -2;
|
|
@dump_node(t, t->originNode);
|
|
"\n";
|
|
}
|
|
|
|
@html_dom_node* @html_tokenize_and_create_node_list(U8* buffer, I64 size, CTask* mem_task,
|
|
I64* num_of_images)
|
|
{
|
|
@html_tokenizer t;
|
|
@init_tokenizer(&t, buffer, size, mem_task);
|
|
while (t.inputBuffer.pos < t.inputBuffer.size && buffer[t.inputBuffer.pos]) {
|
|
switch (t.state) {
|
|
case HTML_STATE_DATA:
|
|
@tokenizer_html_state_data(&t);
|
|
break;
|
|
case HTML_STATE_TAG_OPEN:
|
|
@tokenizer_html_state_tag_open(&t);
|
|
break;
|
|
case HTML_STATE_MARKUP_DECLARATION_OPEN:
|
|
@tokenizer_html_state_markup_declaration_open(&t);
|
|
break;
|
|
case HTML_STATE_DOCTYPE:
|
|
@tokenizer_html_state_doctype(&t);
|
|
break;
|
|
case HTML_STATE_BEFORE_DOCTYPE_NAME:
|
|
@tokenizer_html_state_before_doctype_name(&t);
|
|
break;
|
|
case HTML_STATE_DOCTYPE_NAME:
|
|
@tokenizer_html_state_doctype_name(&t);
|
|
break;
|
|
case HTML_STATE_TAG_NAME:
|
|
@tokenizer_html_state_tag_name(&t);
|
|
break;
|
|
case HTML_STATE_BEFORE_ATTRIBUTE_NAME:
|
|
@tokenizer_html_state_before_attribute_name(&t);
|
|
break;
|
|
case HTML_STATE_ATTRIBUTE_NAME:
|
|
@tokenizer_html_state_attribute_name(&t);
|
|
break;
|
|
case HTML_STATE_BEFORE_ATTRIBUTE_VALUE:
|
|
@tokenizer_html_state_before_attribute_value(&t);
|
|
break;
|
|
case HTML_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED:
|
|
@tokenizer_html_state_attribute_value_double_quoted(&t);
|
|
break;
|
|
case HTML_STATE_AFTER_ATTRIBUTE_VALUE_QUOTED:
|
|
@tokenizer_html_state_after_attribute_value_quoted(&t);
|
|
break;
|
|
case HTML_STATE_CHARACTER_REFERENCE:
|
|
@tokenizer_html_state_character_reference(&t);
|
|
break;
|
|
case HTML_STATE_END_TAG_OPEN:
|
|
@tokenizer_html_state_end_tag_open(&t);
|
|
break;
|
|
case HTML_STATE_AFTER_ATTRIBUTE_NAME:
|
|
@tokenizer_html_state_after_attribute_name(&t);
|
|
break;
|
|
case HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED:
|
|
@tokenizer_html_state_attribute_value_single_quoted(&t);
|
|
break;
|
|
case HTML_STATE_NAMED_CHARACTER_REFERENCE:
|
|
@tokenizer_html_state_named_character_reference(&t);
|
|
break;
|
|
case HTML_STATE_NUMERIC_CHARACTER_REFERENCE:
|
|
@tokenizer_html_state_numeric_character_reference(&t);
|
|
break;
|
|
case HTML_STATE_AFTER_DOCTYPE_NAME:
|
|
@tokenizer_html_state_after_doctype_name(&t);
|
|
break;
|
|
case HTML_STATE_BOGUS_DOCTYPE:
|
|
@tokenizer_html_state_bogus_doctype(&t);
|
|
break;
|
|
case HTML_STATE_SELF_CLOSING_START_TAG:
|
|
@tokenizer_html_state_self_closing_start_tag(&t);
|
|
break;
|
|
case HTML_STATE_BOGUS_COMMENT:
|
|
@tokenizer_html_state_bogus_comment(&t);
|
|
break;
|
|
case HTML_STATE_COMMENT_START:
|
|
@tokenizer_html_state_comment_start(&t);
|
|
break;
|
|
case HTML_STATE_COMMENT:
|
|
@tokenizer_html_state_comment(&t);
|
|
break;
|
|
case HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED:
|
|
@tokenizer_html_state_attribute_value_unquoted(&t);
|
|
break;
|
|
case HTML_STATE_INVALID:
|
|
default:
|
|
"\n$FG,0$HTML Tokenization error: Invalid or unimplemented "
|
|
"state\nInputBuffer position: %d\nState: %d$FD$\n\n",
|
|
t.inputBuffer.pos, t.state;
|
|
PressAKey;
|
|
break;
|
|
}
|
|
}
|
|
@html_dom_node* node_list = t.originNode;
|
|
*num_of_images = t.numOfImgNodes;
|
|
return node_list;
|
|
}
|