// Spaces: Running on T4 -- page banner residue captured alongside the source;
// it is not part of the program.
/**
 * The Lexer class handles tokenizing the input in various ways. Since our
 * parser expects us to be able to backtrack, the lexer allows lexing from any
 * given starting point.
 *
 * Its main exposed function is the `lex` function, which takes a position to
 * lex from and a type of token to lex. It defers to the appropriate `_innerLex`
 * function.
 *
 * The various `_innerLex` functions perform the actual lexing of the different
 * kinds of token.
 */
// Project-local helpers: `matchAt` is called below as
// matchAt(regex, input, pos); `ParseError` is thrown on lexing failures.
var matchAt = require("../../match-at");
var ParseError = require("./ParseError");
/**
 * The main lexer class. It only stores the input; tokens are produced on
 * demand by the prototype methods, each of which takes the position to start
 * lexing from.
 *
 * @constructor
 * @param {string} input The complete text to tokenize.
 */
function Lexer(input) {
    this._input = input;
}
/**
 * The resulting token returned from `lex`.
 *
 * @constructor
 * @param {string} text The text of the token.
 * @param {?Object} data Extra data attached to the token, or null when
 *     there is none.
 * @param {number} position The position value stored on the token; the
 *     lexing functions in this file pass the index just past the matched
 *     text.
 */
function Token(text, data, position) {
    this.text = text;
    this.data = data;
    this.position = position;
}
/* The following tokenRegex
 * - matches typical whitespace (but not NBSP etc.) using its first group
 * - matches symbol combinations which result in a single output character
 * - does not match any control character \x00-\x1f except whitespace
 * - does not match a bare backslash
 * - matches any ASCII character except those just mentioned
 * - does not match the BMP private use area \uE000-\uF8FF
 * - does not match bare surrogate code units
 * - matches any BMP character except for those just described
 * - matches any valid Unicode surrogate pair
 * - matches a backslash followed by one or more letters
 * - matches a backslash followed by any BMP character, including newline
 * Just because the Lexer matches something doesn't mean it's valid input:
 * If there is no matching function or symbol definition, the Parser will
 * still reject the input.
 *
 * Group 1 captures a whitespace run; group 2 captures any other token.
 */
var tokenRegex = new RegExp(
    "([ \r\n\t]+)|(" +                            // whitespace
    "---?" +                                      // special combinations
    "|[!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint
    "|[\uD800-\uDBFF][\uDC00-\uDFFF]" +           // surrogate pair
    "|\\\\(?:[a-zA-Z]+|[^\uD800-\uDFFF])" +       // function name
    ")"
);
// Matches a (possibly empty) run of whitespace. Because `*` allows zero
// characters this pattern always matches, so its result can be indexed
// without a null check.
var whitespaceRegex = /\s*/;
/**
 * This function lexes a single normal token. It takes a position and
 * whether it should completely ignore whitespace or not.
 *
 * @param {number} pos Index in the input at which to start lexing.
 * @param {boolean} ignoreWhitespace When truthy, a whitespace run at `pos`
 *     is skipped and the token following it is returned. Otherwise the
 *     whole run is reported as a single " " token.
 * @return {Token} The lexed token; its `position` is the index just past the
 *     consumed text. When `pos` equals the input length an "EOF" token is
 *     returned.
 * @throws {ParseError} When `tokenRegex` does not match at `pos`.
 */
Lexer.prototype._innerLex = function(pos, ignoreWhitespace) {
    var input = this._input;
    if (pos === input.length) {
        return new Token("EOF", null, pos);
    }
    var match = matchAt(tokenRegex, input, pos);
    if (match === null) {
        throw new ParseError(
            "Unexpected character: '" + input[pos] + "'",
            this, pos);
    } else if (match[2]) { // matched non-whitespace
        return new Token(match[2], null, pos + match[2].length);
    } else if (ignoreWhitespace) {
        // Skip the whitespace run and lex whatever follows it.
        return this._innerLex(pos + match[1].length, true);
    } else { // concatenate whitespace to a single space
        return new Token(" ", null, pos + match[1].length);
    }
};
// A regex to match a CSS color (like #ffffff or BlueViolet). It is
// case-insensitive and only checks the shape of the text: "#" followed by
// letters/digits, or a run of letters. It does not check that the color
// actually exists.
var cssColor = /#[a-z0-9]+|[a-z]+/i;
/**
 * This function lexes a CSS color.
 *
 * Leading whitespace at `pos` is skipped first.
 *
 * @param {number} pos Index in the input at which to start lexing.
 * @return {Token} A token whose text is the matched color and whose
 *     `position` is the index just past it.
 * @throws {ParseError} "Invalid color" when `cssColor` does not match after
 *     the skipped whitespace.
 */
Lexer.prototype._innerLexColor = function(pos) {
    var input = this._input;
    // Ignore whitespace
    var whitespace = matchAt(whitespaceRegex, input, pos)[0];
    pos += whitespace.length;
    var match;
    if ((match = matchAt(cssColor, input, pos))) {
        // If we look like a color, return a color
        return new Token(match[0], null, pos + match[0].length);
    } else {
        throw new ParseError("Invalid color", this, pos);
    }
};
// A regex to match a dimension. Dimensions look like
// "1.2em" or ".4pt" or "1 ex".
// Groups: 1 = optional minus sign, 2 = unsigned number,
// 3 = two-letter lowercase unit.
var sizeRegex = /(-?)\s*(\d+(?:\.\d*)?|\.\d+)\s*([a-z]{2})/;
/**
 * This function lexes a dimension.
 *
 * Leading whitespace at `pos` is skipped first.
 *
 * @param {number} pos Index in the input at which to start lexing.
 * @return {Token} A token whose text is the matched dimension, whose `data`
 *     is `{number, unit}` (the signed numeric value and the two-letter unit)
 *     and whose `position` is the index just past the dimension.
 * @throws {ParseError} "Invalid size" when `sizeRegex` does not match after
 *     the skipped whitespace.
 */
Lexer.prototype._innerLexSize = function(pos) {
    var input = this._input;
    // Ignore whitespace
    var whitespace = matchAt(whitespaceRegex, input, pos)[0];
    pos += whitespace.length;
    var match;
    if ((match = matchAt(sizeRegex, input, pos))) {
        var unit = match[3];
        // No unit validation happens here: the former check that restricted
        // units to "em" and "ex" is disabled, so any two-letter lowercase
        // unit is returned as-is.
        return new Token(match[0], {
            // Sign and digits are captured separately; join, then coerce.
            number: +(match[1] + match[2]),
            unit: unit,
        }, pos + match[0].length);
    }
    throw new ParseError("Invalid size", this, pos);
};
/**
 * This function lexes a string of whitespace.
 *
 * @param {number} pos Index in the input at which to start lexing.
 * @return {Token} A token whose `position` is the index just past the
 *     whitespace run (equal to `pos` when there is none). Its `text` is only
 *     the FIRST character of the run, and is undefined when the run is
 *     empty.
 */
Lexer.prototype._innerLexWhitespace = function(pos) {
    var input = this._input;
    var whitespace = matchAt(whitespaceRegex, input, pos)[0];
    pos += whitespace.length;
    // NOTE(review): `whitespace` is a string, so `whitespace[0]` is its first
    // character rather than the whole run. Kept as-is so existing callers
    // see the same token text -- confirm whether the full run was intended.
    return new Token(whitespace[0], null, pos);
};
/**
 * This function lexes a single token starting at `pos` and of the given mode.
 * Based on the mode, we defer to one of the `_innerLex` functions.
 *
 * @param {number} pos Index in the input at which to start lexing.
 * @param {string} mode One of "math" (normal token, whitespace skipped),
 *     "text" (normal token, whitespace reported as " "), "color", "size" or
 *     "whitespace".
 * @return {Token|undefined} The token produced by the selected function.
 *     Any other mode falls through and returns undefined.
 * @throws {ParseError} Propagated from the selected `_innerLex` function.
 */
Lexer.prototype.lex = function(pos, mode) {
    if (mode === "math") {
        return this._innerLex(pos, true);
    } else if (mode === "text") {
        return this._innerLex(pos, false);
    } else if (mode === "color") {
        return this._innerLexColor(pos);
    } else if (mode === "size") {
        return this._innerLexSize(pos);
    } else if (mode === "whitespace") {
        return this._innerLexWhitespace(pos);
    }
};
// The Lexer constructor is the module's only export.
module.exports = Lexer;