/** * The Lexer class handles tokenizing the input in various ways. Since our * parser expects us to be able to backtrack, the lexer allows lexing from any * given starting point. * * Its main exposed function is the `lex` function, which takes a position to * lex from and a type of token to lex. It defers to the appropriate `_innerLex` * function. * * The various `_innerLex` functions perform the actual lexing of different * kinds. */ var matchAt = require("../../match-at"); var ParseError = require("./ParseError"); // The main lexer class function Lexer(input) { this._input = input; } // The resulting token returned from `lex`. function Token(text, data, position) { this.text = text; this.data = data; this.position = position; } /* The following tokenRegex * - matches typical whitespace (but not NBSP etc.) using its first group * - matches symbol combinations which result in a single output character * - does not match any control character \x00-\x1f except whitespace * - does not match a bare backslash * - matches any ASCII character except those just mentioned * - does not match the BMP private use area \uE000-\uF8FF * - does not match bare surrogate code units * - matches any BMP character except for those just described * - matches any valid Unicode surrogate pair * - matches a backslash followed by one or more letters * - matches a backslash followed by any BMP character, including newline * Just because the Lexer matches something doesn't mean it's valid input: * If there is no matching function or symbol definition, the Parser will * still reject the input. */ var tokenRegex = new RegExp( "([ \r\n\t]+)|(" + // whitespace "---?" + // special combinations "|[!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint "|[\uD800-\uDBFF][\uDC00-\uDFFF]" + // surrogate pair "|\\\\(?:[a-zA-Z]+|[^\uD800-\uDFFF])" + // function name ")" ); var whitespaceRegex = /\s*/; /** * This function lexes a single normal token. It takes a position and * whether it should completely ignore whitespace or not. */ Lexer.prototype._innerLex = function(pos, ignoreWhitespace) { var input = this._input; if (pos === input.length) { return new Token("EOF", null, pos); } var match = matchAt(tokenRegex, input, pos); if (match === null) { throw new ParseError( "Unexpected character: '" + input[pos] + "'", this, pos); } else if (match[2]) { // matched non-whitespace return new Token(match[2], null, pos + match[2].length); } else if (ignoreWhitespace) { return this._innerLex(pos + match[1].length, true); } else { // concatenate whitespace to a single space return new Token(" ", null, pos + match[1].length); } }; // A regex to match a CSS color (like #ffffff or BlueViolet) var cssColor = /#[a-z0-9]+|[a-z]+/i; /** * This function lexes a CSS color. */ Lexer.prototype._innerLexColor = function(pos) { var input = this._input; // Ignore whitespace var whitespace = matchAt(whitespaceRegex, input, pos)[0]; pos += whitespace.length; var match; if ((match = matchAt(cssColor, input, pos))) { // If we look like a color, return a color return new Token(match[0], null, pos + match[0].length); } else { throw new ParseError("Invalid color", this, pos); } }; // A regex to match a dimension. Dimensions look like // "1.2em" or ".4pt" or "1 ex" var sizeRegex = /(-?)\s*(\d+(?:\.\d*)?|\.\d+)\s*([a-z]{2})/; /** * This function lexes a dimension. */ Lexer.prototype._innerLexSize = function(pos) { var input = this._input; // Ignore whitespace var whitespace = matchAt(whitespaceRegex, input, pos)[0]; pos += whitespace.length; var match; if ((match = matchAt(sizeRegex, input, pos))) { var unit = match[3]; // We only currently handle "em" and "ex" units // if (unit !== "em" && unit !== "ex") { // throw new ParseError("Invalid unit: '" + unit + "'", this, pos); // } return new Token(match[0], { number: +(match[1] + match[2]), unit: unit, }, pos + match[0].length); } throw new ParseError("Invalid size", this, pos); }; /** * This function lexes a string of whitespace. */ Lexer.prototype._innerLexWhitespace = function(pos) { var input = this._input; var whitespace = matchAt(whitespaceRegex, input, pos)[0]; pos += whitespace.length; return new Token(whitespace[0], null, pos); }; /** * This function lexes a single token starting at `pos` and of the given mode. * Based on the mode, we defer to one of the `_innerLex` functions. */ Lexer.prototype.lex = function(pos, mode) { if (mode === "math") { return this._innerLex(pos, true); } else if (mode === "text") { return this._innerLex(pos, false); } else if (mode === "color") { return this._innerLexColor(pos); } else if (mode === "size") { return this._innerLexSize(pos); } else if (mode === "whitespace") { return this._innerLexWhitespace(pos); } }; module.exports = Lexer;