// latex2im/katex/src/Parser.js
/* eslint no-constant-condition:0 */
var functions = require("./functions");
var environments = require("./environments");
var Lexer = require("./Lexer");
var symbols = require("./symbols");
var utils = require("./utils");
var parseData = require("./parseData");
var ParseError = require("./ParseError");
// Assigned without `var`, so in a non-strict CommonJS module this becomes a
// property of the global object rather than a module-local variable.
// Parser.prototype.consume appends " " + token text to it for every token it
// consumes, and parseImplicitGroup rewrites its tail after \begin{...} and
// \end{...}. NOTE(review): presumably read (and reset) by code outside this
// file -- confirm before turning it into a local.
global_str = ""
/**
 * This file contains the parser used to parse out a TeX expression from the
 * input. Since TeX isn't context-free, standard parsers don't work particularly
 * well.
 *
 * The strategy of this parser is as such:
 *
 * The main functions (the `.parse...` ones) take a position in the current
 * parse string to parse tokens from. The lexer (found in Lexer.js, stored at
 * this.lexer) also supports pulling out tokens at arbitrary places. When
 * individual tokens are needed at a position, the lexer is called to pull out a
 * token, which is then used.
 *
 * The parser has a property called "mode" indicating the mode that
 * the parser is currently in. Currently it has to be one of "math" or
 * "text", which denotes whether the current environment is a math-y
 * one or a text-y one (e.g. inside \text). Currently, this serves to
 * limit the functions which can be used in text mode.
 *
 * The main functions then return an object which contains the useful data that
 * was parsed at its given point, and a new position at the end of the parsed
 * data. The main functions can call each other and continue the parsing by
 * using the returned position as a new starting point.
 *
 * There are also extra `.handle...` functions, which pull out some reused
 * functionality into self-contained functions.
 *
 * The earlier functions return ParseNodes.
 * The later functions (which are called deeper in the parse) sometimes return
 * ParseFuncOrArgument, which contain a ParseNode as well as some data about
 * whether the parsed object is a function which is missing some arguments, or a
 * standalone object which can be used as an argument to another function.
 */
/**
 * Main Parser class.
 *
 * @param input the text to parse (presumably a TeX string); it is handed
 *              unchanged to the Lexer constructor
 * @param settings parsing options, stored as-is; this file reads
 *                 `throwOnError` and `errorColor` from it
 */
function Parser(input, settings) {
    // Make a new lexer
    this.lexer = new Lexer(input);
    // Store the settings for use in parsing
    this.settings = settings;
}
// Local alias for the ParseNode constructor exported by ./parseData.
var ParseNode = parseData.ParseNode;
/**
 * An initial function (without its arguments), or an argument to a function.
 *
 * @param result for an argument this should be a ParseNode; parseSymbol
 *               stores the bare function name here when `isFunction` is true
 * @param {boolean} isFunction whether `result` names something defined in
 *                             functions.js
 */
function ParseFuncOrArgument(result, isFunction) {
    this.result = result;
    // Is this a function (i.e. is it something defined in functions.js)?
    this.isFunction = isFunction;
}
/**
 * Checks that the look-ahead token has the expected text, and throws an
 * appropriate error otherwise.
 *
 * @param {string} text the token text that must come next
 * @param {boolean=} consume whether to consume the expected token,
 *                           defaults to true
 * @throws {ParseError} if the look-ahead token's text differs from `text`
 */
Parser.prototype.expect = function(text, consume) {
    if (this.nextToken.text !== text) {
        throw new ParseError(
            "Expected '" + text + "', got '" + this.nextToken.text + "'",
            this.lexer, this.nextToken.position
        );
    }
    // Only an explicit `false` suppresses consumption; omitting the argument
    // consumes the token.
    if (consume !== false) {
        this.consume();
    }
};
/**
 * Considers the current look ahead token as consumed,
 * and fetches the one after that as the new look ahead.
 *
 * Side effect: appends " " followed by the consumed token's text to the
 * global `global_str`.
 */
Parser.prototype.consume = function() {
    this.pos = this.nextToken.position;
    // Record the token that is being consumed before it is replaced below.
    global_str = global_str + " " + this.nextToken.text;
    this.nextToken = this.lexer.lex(this.pos, this.mode);
};
/**
 * Main parsing function, which parses an entire input.
 *
 * Resets the parser to math mode at position 0, primes the look-ahead token
 * and delegates to parseInput.
 *
 * @return {?Array.<ParseNode>}
 */
Parser.prototype.parse = function() {
    // Try to parse the input
    this.mode = "math";
    this.pos = 0;
    this.nextToken = this.lexer.lex(this.pos, this.mode);
    return this.parseInput();
};
/**
 * Parses an entire input tree.
 *
 * @return the result of parseExpression for the whole input
 * @throws {ParseError} (via expect) if the expression does not end at EOF
 */
Parser.prototype.parseInput = function() {
    // Parse an expression
    var expression = this.parseExpression(false);
    // If we succeeded, make sure there's an EOF at the end. The EOF token is
    // only checked, not consumed.
    this.expect("EOF", false);
    return expression;
};
// Token texts that always terminate an expression.
var endOfExpression = ["}", "\\end", "\\right", "&", "\\\\", "\\cr"];

/**
 * Parses an "expression", which is a list of atoms.
 *
 * @param {boolean} breakOnInfix Should the parsing stop when we hit infix
 *                  nodes? This happens when functions have higher precedence
 *                  than infix nodes in implicit parses.
 *
 * @param {?string} breakOnToken The token that the expression should end with,
 *                  or `null` if something else should end the expression.
 *
 * @return the collected atoms after handleInfixNodes has processed them
 */
Parser.prototype.parseExpression = function(breakOnInfix, breakOnToken) {
    var body = [];
    // Keep adding atoms to the body until we can't parse any more atoms (either
    // we reached the end, a }, or a \right)
    while (true) {
        var lex = this.nextToken;
        var pos = this.pos;
        if (endOfExpression.indexOf(lex.text) !== -1) {
            break;
        }
        if (breakOnToken && lex.text === breakOnToken) {
            break;
        }
        var atom = this.parseAtom();
        if (!atom) {
            if (!this.settings.throwOnError && lex.text[0] === "\\") {
                // Lenient mode: turn the unknown command into an error node
                // and keep going.
                body.push(this.handleUnsupportedCmd());
                continue;
            }
            break;
        }
        if (breakOnInfix && atom.type === "infix") {
            // rewind so we can parse the infix atom again
            this.pos = pos;
            this.nextToken = lex;
            break;
        }
        body.push(atom);
    }
    return this.handleInfixNodes(body);
};
/**
 * Rewrites infix operators such as \over with corresponding commands such
 * as \frac.
 *
 * There can only be one infix operator per group.  If there's more than one
 * then the expression is ambiguous.  This can be resolved by adding {}.
 *
 * @param {Array} body the atoms of one group
 * @returns {Array} `body` itself when it contains no infix node, otherwise a
 *                  one-element array holding the rewritten function node
 * @throws {ParseError} if `body` contains more than one infix node
 */
Parser.prototype.handleInfixNodes = function(body) {
    var overIndex = -1;
    var funcName;

    for (var i = 0; i < body.length; i++) {
        var node = body[i];
        if (node.type === "infix") {
            if (overIndex !== -1) {
                throw new ParseError("only one infix operator per group",
                    this.lexer, -1);
            }
            overIndex = i;
            funcName = node.value.replaceWith;
        }
    }

    if (overIndex === -1) {
        return body;
    }

    var numerBody = body.slice(0, overIndex);
    var denomBody = body.slice(overIndex + 1);
    var numerNode;
    var denomNode;

    // A side that already is a single ordgroup is reused instead of being
    // wrapped in a second one.
    if (numerBody.length === 1 && numerBody[0].type === "ordgroup") {
        numerNode = numerBody[0];
    } else {
        numerNode = new ParseNode("ordgroup", numerBody, this.mode);
    }

    if (denomBody.length === 1 && denomBody[0].type === "ordgroup") {
        denomNode = denomBody[0];
    } else {
        denomNode = new ParseNode("ordgroup", denomBody, this.mode);
    }

    var value = this.callFunction(
        funcName, [numerNode, denomNode], null);
    return [new ParseNode(value.type, value, this.mode)];
};
// The greediness of a superscript or subscript
var SUPSUB_GREEDINESS = 1;

/**
 * Handle a subscript or superscript with nice errors.
 *
 * Expects the look-ahead token to be the `^` or `_` itself; it is consumed
 * here before the following group is parsed.
 *
 * @param {string} name "superscript" or "subscript", used in error messages
 * @return the parsed group, an error node for an unsupported command in
 *         lenient mode, or `undefined` when no group follows (the
 *         "Expected group after ..." error is disabled in this copy)
 * @throws {ParseError} if the group is a function whose greediness does not
 *                      exceed SUPSUB_GREEDINESS
 */
Parser.prototype.handleSupSubscript = function(name) {
    var symPos = this.pos;
    this.consume();
    var group = this.parseGroup();

    if (!group) {
        if (!this.settings.throwOnError && this.nextToken.text[0] === "\\") {
            return this.handleUnsupportedCmd();
        }
        // A missing group is tolerated here instead of raising
        // "Expected group after '^'"; the caller receives `undefined`.
        return undefined;
    }

    if (!group.isFunction) {
        return group.result;
    }

    // ^ and _ have a greediness, so handle interactions with functions'
    // greediness
    var funcGreediness = functions[group.result].greediness;
    if (funcGreediness > SUPSUB_GREEDINESS) {
        return this.parseFunction(group);
    }
    throw new ParseError(
        "Got function '" + group.result + "' with no arguments " +
            "as " + name,
        this.lexer, symPos + 1);
};
/**
 * Converts the textual input of an unsupported command into a text node
 * contained within a color node whose color is determined by errorColor.
 *
 * The look-ahead token supplies the text and is consumed afterwards.
 *
 * @return {ParseNode} a "color" node wrapping a single "text" node
 */
Parser.prototype.handleUnsupportedCmd = function() {
    var text = this.nextToken.text;
    var textordArray = [];

    // One "textord" node per character of the command's source text.
    for (var i = 0; i < text.length; i++) {
        textordArray.push(new ParseNode("textord", text[i], "text"));
    }

    var textNode = new ParseNode(
        "text",
        {
            body: textordArray,
            type: "text",
        },
        this.mode);

    var colorNode = new ParseNode(
        "color",
        {
            color: this.settings.errorColor,
            value: [textNode],
            type: "color",
        },
        this.mode);

    this.consume();
    return colorNode;
};
/**
 * Parses a group with optional super/subscripts.
 *
 * The "Double superscript" / "Double subscript" errors are disabled in this
 * copy: a repeated `^` or `_` simply replaces the value parsed earlier.
 *
 * @return {?ParseNode}
 * @throws {ParseError} if \limits or \nolimits does not follow an "op" node
 */
Parser.prototype.parseAtom = function() {
    // The body of an atom is an implicit group, so that things like
    // \left(x\right)^2 work correctly.
    var base = this.parseImplicitGroup();

    // In text mode, we don't have superscripts or subscripts
    if (this.mode === "text") {
        return base;
    }

    // Note that base may be empty (i.e. null) at this point.

    var superscript;
    var subscript;
    while (true) {
        // Lex the first token
        var lex = this.nextToken;

        if (lex.text === "\\limits" || lex.text === "\\nolimits") {
            // We got a limit control
            if (!base || base.type !== "op") {
                throw new ParseError(
                    "Limit controls must follow a math operator",
                    this.lexer, this.pos);
            }
            base.value.limits = lex.text === "\\limits";
            base.value.alwaysHandleSupSub = true;
            this.consume();
        } else if (lex.text === "^") {
            // We got a superscript start
            superscript = this.handleSupSubscript("superscript");
        } else if (lex.text === "_") {
            // We got a subscript start
            subscript = this.handleSupSubscript("subscript");
        } else if (lex.text === "'") {
            // We got a prime
            var prime = new ParseNode("textord", "\\prime", this.mode);

            // Many primes can be grouped together, so we handle this here
            var primes = [prime];
            this.consume();
            // Keep lexing tokens until we get something that's not a prime
            while (this.nextToken.text === "'") {
                // For each one, add another prime to the list
                primes.push(prime);
                this.consume();
            }
            // Put them into an ordgroup as the superscript
            superscript = new ParseNode("ordgroup", primes, this.mode);
        } else {
            // If it wasn't ^, _, or ', stop parsing super/subscripts
            break;
        }
    }

    if (superscript || subscript) {
        // If we got either a superscript or subscript, create a supsub
        return new ParseNode("supsub", {
            base: base,
            sup: superscript,
            sub: subscript,
        }, this.mode);
    }
    // Otherwise return the original body
    return base;
};
// A list of the size-changing functions, for use in parseImplicitGroup.
// parseImplicitGroup derives the size name from an entry's 1-based position
// in this list, so the order matters.
// NOTE(review): "\\siptstyle" looks like a typo, but it is kept verbatim
// because removing or renaming it would shift the indices of later entries.
var sizeFuncs = [
    "\\tiny", "\\scriptsize", "\\footnotesize", "\\small", "\\normalsize",
    "\\large", "\\Large", "\\LARGE", "\\huge", "\\Huge", "\\textrm", "\\rm", "\\cal",
    "\\bf", "\\siptstyle", "\\boldmath", "\\it",
];

// A list of the style-changing functions, for use in parseImplicitGroup
var styleFuncs = [
    "\\displaystyle", "\\textstyle", "\\scriptstyle", "\\scriptscriptstyle",
];
/**
 * Parses an implicit group, which is a group that starts right after a
 * specific command and ends right before the enclosing explicit group ends,
 * or at the end of the input. It is used for functions that appear to affect
 * the current style, like \Large or \textrm, where instead of keeping a style
 * we just pretend that there is an implicit grouping after it until the end
 * of the group. E.g.
 *   small text {\Large large text} small text again
 * It is also used for \left and \right to get the correct grouping, for
 * \begin...\end environments, and for the brace forms \matrix{...},
 * \pmatrix{...} and \cases{...}.
 *
 * @return {?ParseNode}
 * @throws {ParseError} for an unknown \begin environment or a \begin/\end
 *                      name mismatch
 */
Parser.prototype.parseImplicitGroup = function() {
    var start = this.parseSymbol();

    if (start == null) {
        // If we didn't get anything we handle, fall back to parseFunction
        return this.parseFunction();
    }

    // Either a function name (string) or a ParseNode for a plain symbol.
    var func = start.result;
    var body;
    var envName;
    var env;
    var context;
    var result;
    var name;

    if (func === "\\left") {
        // If we see a left:
        // Parse the entire left function (including the delimiter)
        var left = this.parseFunction(start);
        // Parse out the implicit body
        body = this.parseExpression(false);
        // Check the next token
        this.expect("\\right", false);
        var right = this.parseFunction();
        return new ParseNode("leftright", {
            body: body,
            left: left.value.value,
            right: right.value.value,
        }, this.mode);
    } else if (func === "\\begin") {
        // begin...end is similar to left...right
        var begin = this.parseFunction(start);
        envName = begin.value.name;
        name = (begin.value.name + "");
        // Replace the last (2 * name.length + 2) characters of global_str
        // with "<name>}". NOTE(review): presumably the name was consumed one
        // character per token, followed by "}" -- verify against the lexer.
        global_str = global_str.substring(0, global_str.length - (name.length * 2 + 2)) + name + "}";
        if (!Object.prototype.hasOwnProperty.call(environments, envName)) {
            throw new ParseError(
                "No such environment: " + envName,
                this.lexer, begin.value.namepos);
        }
        // Build the environment object. Arguments and other information will
        // be made available to the begin and end methods using properties.
        env = environments[envName];
        var args = this.parseArguments("\\begin{" + envName + "}", env);
        context = {
            mode: this.mode,
            envName: envName,
            parser: this,
            lexer: this.lexer,
            // parseArguments appends the positions list as the last element.
            positions: args.pop(),
        };
        result = env.handler(context, args);
        this.expect("\\end", false);
        var end = this.parseFunction();
        name = (end.value.name + "");
        // Same tail rewrite as above, now for the \end{...} name.
        global_str = global_str.substring(0, global_str.length - (name.length * 2 + 2)) + name + "}";
        if (end.value.name !== envName) {
            throw new ParseError(
                "Mismatch: \\begin{" + envName + "} matched " +
                "by \\end{" + end.value.name + "}",
                this.lexer /* , end.value.namepos */);
            // TODO: Add position to the above line and adjust test case,
            // requires #385 to get merged first
        }
        result.position = end.position;
        return result;
    } else if (func.value === "\\matrix" || func.value === "\\pmatrix" ||
               func.value === "\\cases") {
        // Brace form, e.g. \matrix{...}: the environment name is the command
        // without its backslash, and the body is delimited by { }.
        // Unlike the \begin branch there is no check that the environment
        // exists.
        envName = func.value.slice(1);
        env = environments[envName];
        this.expect("{", true);
        context = {
            mode: this.mode,
            envName: envName,
            parser: this,
            lexer: this.lexer,
        };
        result = env.handler(context, {});
        this.expect("}", true);
        return result;
    } else if (utils.contains(sizeFuncs, func)) {
        // If we see a sizing function, parse out the implicit body
        body = this.parseExpression(false);
        return new ParseNode("sizing", {
            // Figure out what size to use based on the list of functions above
            original: func,
            size: "size" + (utils.indexOf(sizeFuncs, func) + 1),
            value: body,
        }, this.mode);
    } else if (utils.contains(styleFuncs, func)) {
        // If we see a styling function, parse out the implicit body
        body = this.parseExpression(true);
        return new ParseNode("styling", {
            // Figure out what style to use by pulling out the style from
            // the function name
            original: func,
            style: func.slice(1, func.length - 5),
            value: body,
        }, this.mode);
    } else {
        // Defer to parseFunction if it's not a function we handle
        return this.parseFunction(start);
    }
};
/**
 * Parses an entire function, including its base and all of its arguments.
 * The base might either have been parsed already, in which case
 * it is provided as an argument, or it's the next group in the input.
 *
 * The "Can't use function ... in text mode" error is disabled in this copy,
 * so `allowedInText` is not enforced here.
 *
 * @param {ParseFuncOrArgument=} baseGroup optional as described above
 * @return {?ParseNode} `null` when there is no group to parse
 */
Parser.prototype.parseFunction = function(baseGroup) {
    if (!baseGroup) {
        baseGroup = this.parseGroup();
    }

    if (!baseGroup) {
        return null;
    }
    if (!baseGroup.isFunction) {
        return baseGroup.result;
    }

    var func = baseGroup.result;
    var funcData = functions[func];
    var args = this.parseArguments(func, funcData);
    // parseArguments appends the positions list as the last element; split
    // it off before invoking the handler.
    var positions = args.pop();
    var result = this.callFunction(func, args, positions);
    return new ParseNode(result.type, result, this.mode);
};
/**
 * Call a function handler with a suitable context and arguments.
 *
 * @param {string} name key into the `functions` table
 * @param {Array} args arguments passed through to the handler
 * @param positions passed through as `context.positions`
 * @return whatever the handler returns
 */
Parser.prototype.callFunction = function(name, args, positions) {
    var context = {
        funcName: name,
        parser: this,
        lexer: this.lexer,
        positions: positions,
    };
    return functions[name].handler(context, args);
};
/**
 * Parses the arguments of a function or environment
 *
 * @param {string} func  "\name" or "\begin{name}"
 * @param {{numArgs:number,numOptionalArgs:number|undefined}} funcData
 * @return the array of arguments, with the list of positions as last element
 * @throws {ParseError} if a required argument is missing and cannot be
 *                      replaced by an error node
 */
Parser.prototype.parseArguments = function(func, funcData) {
    var totalArgs = funcData.numArgs + funcData.numOptionalArgs;
    if (totalArgs === 0) {
        return [[this.pos]];
    }

    var baseGreediness = funcData.greediness;
    var positions = [this.pos];
    var args = [];

    for (var i = 0; i < totalArgs; i++) {
        var argType = funcData.argTypes && funcData.argTypes[i];
        var arg;
        if (i < funcData.numOptionalArgs) {
            // Optional arguments come first.
            if (argType) {
                arg = this.parseSpecialGroup(argType, true);
            } else {
                arg = this.parseOptionalGroup();
            }
            if (!arg) {
                // An absent optional argument is recorded as null.
                args.push(null);
                positions.push(this.pos);
                continue;
            }
        } else {
            if (argType) {
                arg = this.parseSpecialGroup(argType);
            } else {
                arg = this.parseGroup();
            }
            if (!arg) {
                if (!this.settings.throwOnError &&
                    this.nextToken.text[0] === "\\") {
                    arg = new ParseFuncOrArgument(
                        this.handleUnsupportedCmd(),
                        false);
                } else {
                    throw new ParseError(
                        "Expected group after '" + func + "'",
                        this.lexer, this.pos);
                }
            }
        }
        var argNode;
        if (arg.isFunction) {
            var argGreediness =
                functions[arg.result].greediness;
            if (argGreediness > baseGreediness) {
                argNode = this.parseFunction(arg);
            }
            // The "Got function ... as argument to ..." error is disabled in
            // this copy. NOTE(review): when the function is not greedy
            // enough, `argNode` is left untouched; because `var` is
            // function-scoped it then still holds the previous iteration's
            // node (or undefined on the first one). Preserved as-is --
            // confirm this is intended.
        } else {
            argNode = arg.result;
        }
        args.push(argNode);
        positions.push(this.pos);
    }

    args.push(positions);

    return args;
};
/**
 * Parses a group when the mode is changing. Temporarily switches the parser
 * into `innerMode` and restores the previous mode before returning.
 *
 * @param {string} innerMode "color", "size", "text", "math", or "original"
 *                           (meaning: keep the current mode)
 * @param {boolean=} optional parse a bracketed optional group instead of a
 *                            required one
 * @return {?ParseFuncOrArgument}
 */
Parser.prototype.parseSpecialGroup = function(innerMode, optional) {
    var outerMode = this.mode;
    // Handle `original` argTypes
    if (innerMode === "original") {
        innerMode = outerMode;
    }

    if (innerMode === "color" || innerMode === "size") {
        // color and size modes are special because they should have braces and
        // should only lex a single symbol inside
        var openBrace = this.nextToken;
        if (optional && openBrace.text !== "[") {
            // optional arguments should return null if they don't exist
            return null;
        }
        // The call to expect will lex the token after the '{' in inner mode
        this.mode = innerMode;
        this.expect(optional ? "[" : "{");
        var inner = this.nextToken;
        this.mode = outerMode;
        var data;
        if (innerMode === "color") {
            data = inner.text;
        } else {
            // Size tokens carry their payload in `data` rather than `text`.
            data = inner.data;
        }
        this.consume(); // consume the token stored in inner
        this.expect(optional ? "]" : "}");
        return new ParseFuncOrArgument(
            new ParseNode(innerMode, data, outerMode),
            false);
    } else if (innerMode === "text") {
        // text mode is special because it should ignore the whitespace before
        // it
        var whitespace = this.lexer.lex(this.pos, "whitespace");
        this.pos = whitespace.position;
    }

    // By the time we get here, innerMode is one of "text" or "math".
    // We switch the mode of the parser, recurse, then restore the old mode.
    this.mode = innerMode;
    this.nextToken = this.lexer.lex(this.pos, innerMode);
    var res;
    if (optional) {
        res = this.parseOptionalGroup();
    } else {
        res = this.parseGroup();
    }
    this.mode = outerMode;
    this.nextToken = this.lexer.lex(this.pos, outerMode);
    return res;
};
/**
 * Parses a group, which is either a single nucleus (like "x") or an expression
 * in braces (like "{x+y}")
 *
 * @return {?ParseFuncOrArgument}
 * @throws {ParseError} (via expect) if a braced group is not closed by "}"
 */
Parser.prototype.parseGroup = function() {
    // Try to parse an open brace
    if (this.nextToken.text === "{") {
        // If we get a brace, parse an expression
        this.consume();
        var expression = this.parseExpression(false);
        // Make sure we get a close brace
        this.expect("}");
        return new ParseFuncOrArgument(
            new ParseNode("ordgroup", expression, this.mode),
            false);
    } else {
        // Otherwise, just return a nucleus
        return this.parseSymbol();
    }
};
/**
 * Parses a group, which is an expression in brackets (like "[x+y]")
 *
 * @return {?ParseFuncOrArgument} `null` when the look-ahead token is not "["
 * @throws {ParseError} (via expect) if the group is not closed by "]"
 */
Parser.prototype.parseOptionalGroup = function() {
    // Try to parse an open bracket
    if (this.nextToken.text === "[") {
        // If we get a bracket, parse an expression that stops at "]"
        this.consume();
        var expression = this.parseExpression(false, "]");
        // Make sure we get a close bracket
        this.expect("]");
        return new ParseFuncOrArgument(
            new ParseNode("ordgroup", expression, this.mode),
            false);
    } else {
        // Otherwise, return null,
        return null;
    }
};
/**
 * Parse a single symbol out of the string. Here, we handle both the functions
 * we have defined, as well as the single character symbols.
 *
 * Unlike a strict parser, an unrecognised token is not rejected: anything
 * other than "EOF" or "{" is consumed and wrapped in a node that borrows the
 * group of the math-mode "\sigma" symbol.
 *
 * @return {?ParseFuncOrArgument} `null` only for "EOF" and "{" (neither is
 *                                consumed)
 */
Parser.prototype.parseSymbol = function() {
    var nucleus = this.nextToken;

    if (functions[nucleus.text]) {
        this.consume();
        // If there exists a function with this name, we return the function and
        // say that it is a function.
        return new ParseFuncOrArgument(
            nucleus.text,
            true);
    } else if (symbols[this.mode][nucleus.text]) {
        this.consume();
        // Otherwise if this is a no-argument function, find the type it
        // corresponds to in the symbols map
        return new ParseFuncOrArgument(
            new ParseNode(symbols[this.mode][nucleus.text].group,
                          nucleus.text, this.mode),
            false);
    } else if (nucleus.text === "EOF" || nucleus.text === "{") {
        return null;
    } else {
        // Unknown token: accept it anyway, typed like "\sigma".
        this.consume();
        return new ParseFuncOrArgument(
            new ParseNode(symbols["math"]["\\sigma"].group,
                          nucleus.text, this.mode),
            false);
    }
};
// Make the ParseNode constructor reachable from parser instances.
Parser.prototype.ParseNode = ParseNode;

// CommonJS export: requiring this module yields the Parser constructor.
module.exports = Parser;