Spaces:

yuntian-deng
/

latex2im

Running on T4

App Files Files Community

latex2im / katex /src /Parser.js

da03

.

b498cbf 4 months ago

history blame contribute delete

26.4 kB

	/* eslint no-constant-condition:0 */
	var functions = require("./functions");
	var environments = require("./environments");
	var Lexer = require("./Lexer");
	var symbols = require("./symbols");
	var utils = require("./utils");

	var parseData = require("./parseData");
	var ParseError = require("./ParseError");

	// Running record of consumed tokens: consume() appends " " + token text to
	// this string, and parseImplicitGroup() rewrites its tail after \begin/\end.
	// NOTE(review): assigned without a declaration, so this is an implicit
	// global (and would throw under strict mode). Presumably it is read by code
	// outside this file — confirm before scoping it with `var`.
	global_str = ""

	/**
	* This file contains the parser used to parse out a TeX expression from the
	* input. Since TeX isn't context-free, standard parsers don't work particularly
	* well.
	*
	* The strategy of this parser is as such:
	*
	* The main functions (the `.parse...` ones) take a position in the current
	* parse string to parse tokens from. The lexer (found in Lexer.js, stored at
	* this.lexer) also supports pulling out tokens at arbitrary places. When
	* individual tokens are needed at a position, the lexer is called to pull out a
	* token, which is then used.
	*
	* The parser has a property called "mode" indicating the mode that
	* the parser is currently in. Currently it has to be one of "math" or
	* "text", which denotes whether the current environment is a math-y
	* one or a text-y one (e.g. inside \text). Currently, this serves to
	* limit the functions which can be used in text mode.
	*
	* The main functions then return an object which contains the useful data that
	* was parsed at its given point, and a new position at the end of the parsed
	* data. The main functions can call each other and continue the parsing by
	* using the returned position as a new starting point.
	*
	* There are also extra `.handle...` functions, which pull out some reused
	* functionality into self-contained functions.
	*
	* The earlier functions return ParseNodes.
	* The later functions (which are called deeper in the parse) sometimes return
	* ParseFuncOrArgument, which contain a ParseNode as well as some data about
	* whether the parsed object is a function which is missing some arguments, or a
	* standalone object which can be used as an argument to another function.
	*/

	/**
	* Main Parser class
	*/
	function Parser(input, settings) {
	    // Tokenizer over the raw input; the parse methods pull their tokens
	    // from it via this.lexer.
	    var lexer = new Lexer(input);
	    this.lexer = lexer;
	    // Options object consulted while parsing (e.g. throwOnError,
	    // errorColor).
	    this.settings = settings;
	}

	// Local alias for the ParseNode constructor exported by ./parseData; used
	// throughout this file as `new ParseNode(type, value, mode)`.
	var ParseNode = parseData.ParseNode;

	/**
	* An initial function (without its arguments), or an argument to a function.
	* The `result` argument should be a ParseNode.
	*/
	function ParseFuncOrArgument(result, isFunction) {
	// What was parsed. parseSymbol stores the raw token text here when
	// isFunction is true, and a ParseNode otherwise.
	this.result = result;
	// Is this a function (i.e. is it something defined in functions.js)?
	this.isFunction = isFunction;
	}

	/**
	* Checks a result to make sure it has the right type, and throws an
	* appropriate error otherwise.
	*
	* @param {boolean=} consume whether to consume the expected token,
	* defaults to true
	*/
	Parser.prototype.expect = function(text, consume) {
	    var lookahead = this.nextToken;
	    if (lookahead.text !== text) {
	        throw new ParseError(
	            "Expected '" + text + "', got '" + lookahead.text + "'",
	            this.lexer, lookahead.position
	        );
	    }
	    // Only an explicit `false` keeps the token as the look ahead; an
	    // omitted argument means "consume it".
	    if (consume === false) {
	        return;
	    }
	    this.consume();
	};

	/**
	* Considers the current look ahead token as consumed,
	* and fetches the one after that as the new look ahead.
	*/
	Parser.prototype.consume = function() {
	    var consumed = this.nextToken;

	    // Log the token text in the module-wide token record.
	    global_str += " " + consumed.text;

	    // Advance to the position carried by the consumed token and lex the
	    // next look ahead in the current mode.
	    this.pos = consumed.position;
	    this.nextToken = this.lexer.lex(this.pos, this.mode);
	};

	/**
	* Main parsing function, which parses an entire input.
	*
	* @return {?Array.<ParseNode>}
	*/
	Parser.prototype.parse = function() {
	    // Every parse starts in math mode at the beginning of the input.
	    this.mode = "math";
	    this.pos = 0;
	    // Prime the one-token look ahead before handing over to parseInput.
	    this.nextToken = this.lexer.lex(this.pos, this.mode);
	    return this.parseInput();
	};

	/**
	* Parses an entire input tree.
	*/
	Parser.prototype.parseInput = function() {
	    var tree = this.parseExpression(false);
	    // The whole input must have been used up: check for EOF, but leave it
	    // as the look ahead (consume = false).
	    this.expect("EOF", false);
	    return tree;
	};

	// Token texts that always terminate parseExpression; the terminating token
	// is left unconsumed for the caller to handle.
	var endOfExpression = ["}", "\\end", "\\right", "&", "\\\\", "\\cr"];

	/**
	* Parses an "expression", which is a list of atoms.
	*
	* @param {boolean} breakOnInfix Should the parsing stop when we hit infix
	* nodes? This happens when functions have higher precedence
	* than infix nodes in implicit parses.
	*
	* @param {?string} breakOnToken The token that the expression should end with,
	* or `null` if something else should end the expression.
	*
	* @return {ParseNode}
	*/
	Parser.prototype.parseExpression = function(breakOnInfix, breakOnToken) {
	    var atoms = [];
	    // Collect atoms until a terminator token, the caller's break token, or
	    // something that cannot be parsed as an atom.
	    for (;;) {
	        // Snapshot the look ahead state so an infix atom can be un-read.
	        var startToken = this.nextToken;
	        var startPos = this.pos;

	        var isTerminator =
	            endOfExpression.indexOf(startToken.text) !== -1 ||
	            (breakOnToken && startToken.text === breakOnToken);
	        if (isTerminator) {
	            break;
	        }

	        var atom = this.parseAtom();
	        if (!atom) {
	            // In non-throwing mode an unparsable control sequence is kept
	            // as an error-coloured text node and parsing carries on.
	            var recoverable = !this.settings.throwOnError &&
	                startToken.text[0] === "\\";
	            if (!recoverable) {
	                break;
	            }
	            atoms.push(this.handleUnsupportedCmd());
	            continue;
	        }

	        if (breakOnInfix && atom.type === "infix") {
	            // Rewind so the caller can parse the infix atom again.
	            this.pos = startPos;
	            this.nextToken = startToken;
	            break;
	        }
	        atoms.push(atom);
	    }
	    return this.handleInfixNodes(atoms);
	};

	/**
	* Rewrites infix operators such as \over with corresponding commands such
	* as \frac.
	*
	* There can only be one infix operator per group. If there's more than one
	* then the expression is ambiguous. This can be resolved by adding {}.
	*
	* @returns {Array}
	*/
	Parser.prototype.handleInfixNodes = function(body) {
	    var infixAt = -1;
	    var replacement;

	    // Locate the (single) infix node; a second one makes the group
	    // ambiguous.
	    for (var idx = 0; idx < body.length; idx++) {
	        if (body[idx].type !== "infix") {
	            continue;
	        }
	        if (infixAt !== -1) {
	            throw new ParseError("only one infix operator per group",
	                this.lexer, -1);
	        }
	        infixAt = idx;
	        replacement = body[idx].value.replaceWith;
	    }

	    // Nothing to rewrite: hand the list back untouched.
	    if (infixAt === -1) {
	        return body;
	    }

	    var mode = this.mode;
	    // A side that already is exactly one ordgroup is used as-is; anything
	    // else (including an empty side) is wrapped in a fresh ordgroup.
	    var asGroup = function(nodes) {
	        var alreadyGrouped =
	            nodes.length === 1 && nodes[0].type === "ordgroup";
	        return alreadyGrouped ?
	            nodes[0] : new ParseNode("ordgroup", nodes, mode);
	    };

	    var numerNode = asGroup(body.slice(0, infixAt));
	    var denomNode = asGroup(body.slice(infixAt + 1));

	    // Delegate to the replacement command (the node's `replaceWith`).
	    var value = this.callFunction(
	        replacement, [numerNode, denomNode], null);
	    return [new ParseNode(value.type, value, this.mode)];
	};

	// The greediness of a superscript or subscript
	// Greediness of ^ and _. handleSupSubscript only lets a bare function be
	// used as a script argument when that function's greediness is strictly
	// greater than this value.
	var SUPSUB_GREEDINESS = 1;

	/**
	* Handle a subscript or superscript with nice errors.
	*/
	Parser.prototype.handleSupSubscript = function(name) {
	    // Position before the ^ / _ token; used for error reporting below.
	    var markerPos = this.pos;
	    this.consume();
	    var group = this.parseGroup();

	    if (!group) {
	        if (!this.settings.throwOnError &&
	                this.nextToken.text[0] === "\\") {
	            return this.handleUnsupportedCmd();
	        }
	        // A missing group is tolerated here (no "Expected group" error):
	        // the caller simply receives undefined as the script.
	        return undefined;
	    }

	    if (!group.isFunction) {
	        return group.result;
	    }

	    // ^ and _ have a greediness of their own, so a bare function is only
	    // accepted as the script when it binds tighter than they do.
	    if (functions[group.result].greediness > SUPSUB_GREEDINESS) {
	        return this.parseFunction(group);
	    }

	    throw new ParseError(
	        "Got function '" + group.result + "' with no arguments " +
	            "as " + name,
	        this.lexer, markerPos + 1);
	};

	/**
	* Converts the textual input of an unsupported command into a text node
	* contained within a color node whose color is determined by errorColor
	*/
	Parser.prototype.handleUnsupportedCmd = function() {
	    var raw = this.nextToken.text;

	    // One text-mode textord per character of the command's source text.
	    var letters = raw.split("").map(function(ch) {
	        return new ParseNode("textord", ch, "text");
	    });

	    var textNode = new ParseNode(
	        "text", { body: letters, type: "text" }, this.mode);

	    // Wrap the text in the configured error colour.
	    var colorNode = new ParseNode(
	        "color",
	        {
	            color: this.settings.errorColor,
	            value: [textNode],
	            type: "color",
	        },
	        this.mode);

	    // The command itself is consumed only after the nodes are built.
	    this.consume();
	    return colorNode;
	};

	/**
	* Parses a group with optional super/subscripts.
	*
	* @return {?ParseNode}
	*/
	Parser.prototype.parseAtom = function() {
	    // The body of an atom is an implicit group, so that things like
	    // \left(x\right)^2 work correctly.
	    var base = this.parseImplicitGroup();

	    // In text mode, we don't have superscripts or subscripts
	    if (this.mode === "text") {
	        return base;
	    }

	    // Note that base may be empty (i.e. null) at this point.

	    var superscript;
	    var subscript;
	    while (true) {
	        // Inspect the look ahead token
	        var lex = this.nextToken;

	        if (lex.text === "\\limits" || lex.text === "\\nolimits") {
	            // We got a limit control, which is only valid after an "op"
	            if (!base || base.type !== "op") {
	                throw new ParseError(
	                    "Limit controls must follow a math operator",
	                    this.lexer, this.pos);
	            }
	            base.value.limits = lex.text === "\\limits";
	            base.value.alwaysHandleSupSub = true;
	            this.consume();
	        } else if (lex.text === "^") {
	            // Superscript start. A repeated ^ is not rejected here: the
	            // later script simply replaces the earlier one.
	            superscript = this.handleSupSubscript("superscript");
	        } else if (lex.text === "_") {
	            // Subscript start. As with ^, a repeated _ replaces the
	            // earlier subscript instead of raising an error.
	            subscript = this.handleSupSubscript("subscript");
	        } else if (lex.text === "'") {
	            // We got a prime
	            var prime = new ParseNode("textord", "\\prime", this.mode);

	            // Many primes can be grouped together, so we handle this here
	            var primes = [prime];
	            this.consume();
	            // Keep consuming until we get something that's not a prime
	            while (this.nextToken.text === "'") {
	                // Every additional prime reuses the same node object
	                primes.push(prime);
	                this.consume();
	            }
	            // Put them into an ordgroup as the superscript
	            superscript = new ParseNode("ordgroup", primes, this.mode);
	        } else {
	            // If it wasn't \limits, ^, _, or ', stop parsing scripts
	            break;
	        }
	    }

	    if (superscript || subscript) {
	        // If we got either a superscript or subscript, create a supsub
	        return new ParseNode("supsub", {
	            base: base,
	            sup: superscript,
	            sub: subscript,
	        }, this.mode);
	    }

	    // Otherwise return the original body
	    return base;
	};

	// A list of the size-changing functions, for use in parseImplicitGroup
	// Commands that parseImplicitGroup turns into "sizing" nodes. The order is
	// significant: the emitted size is "size" + (index in this list + 1), so
	// \tiny is size1 and \Huge is size10. The entries after \Huge are font-style
	// commands that go through the same path (size11 and up).
	// NOTE(review): "\\siptstyle" looks like a misspelling (possibly of
	// \scriptstyle) — confirm whether it is deliberately accepted as input
	// before changing it, since removing an entry shifts later indices.
	var sizeFuncs = [
	"\\tiny", "\\scriptsize", "\\footnotesize", "\\small", "\\normalsize",
	"\\large", "\\Large", "\\LARGE", "\\huge", "\\Huge", "\\textrm", "\\rm", "\\cal",
	"\\bf", "\\siptstyle", "\\boldmath", "\\it"
	];

	// A list of the style-changing functions, for use in parseImplicitGroup
	// Commands that parseImplicitGroup turns into "styling" nodes. The style
	// name is derived from the command by dropping the leading backslash and
	// the trailing "style" (e.g. "\\displaystyle" -> "display").
	var styleFuncs = [
	"\\displaystyle", "\\textstyle", "\\scriptstyle", "\\scriptscriptstyle",
	];

	/**
	* Parses an implicit group, which is a group that starts at the end of a
	* specified function, and ends right before a higher explicit group ends, or
	* at EOL. It
	* is used for functions that appear to affect the current style, like \Large or
	* \textrm, where instead of keeping a style we just pretend that there is an
	* implicit grouping after it until the end of the group. E.g.
	* small text {\Large large text} small text again
	* It is also used for \left and \right to get the correct grouping.
	*
	* @return {?ParseNode}
	*/
	Parser.prototype.parseImplicitGroup = function() {
	    var start = this.parseSymbol();

	    if (start == null) {
	        // If we didn't get anything we handle, fall back to parseFunction
	        return this.parseFunction();
	    }

	    // `func` is the function name (a string) when parseSymbol recognised a
	    // function, and a ParseNode otherwise.
	    var func = start.result;
	    var body;
	    var envName;
	    var env;
	    var context;
	    var result;

	    if (func === "\\left") {
	        // Parse the entire left function (including the delimiter)
	        var left = this.parseFunction(start);
	        // Parse out the implicit body
	        body = this.parseExpression(false);
	        // The body must be closed by \right, which is parsed as a function
	        this.expect("\\right", false);
	        var right = this.parseFunction();
	        return new ParseNode("leftright", {
	            body: body,
	            left: left.value.value,
	            right: right.value.value,
	        }, this.mode);
	    } else if (func === "\\begin") {
	        // begin...end is similar to left...right
	        var begin = this.parseFunction(start);
	        envName = begin.value.name;
	        var name = begin.value.name + "";

	        // Replace the last (2 * name.length + 2) characters of the token
	        // record with name + "}".
	        // NOTE(review): presumably the name was recorded one character per
	        // token (" n a m e }") and this collapses it — confirm against the
	        // lexer's text-mode tokenisation.
	        global_str = global_str.substring(
	            0, global_str.length - (name.length * 2 + 2)) + name + "}";

	        if (!environments.hasOwnProperty(envName)) {
	            throw new ParseError(
	                "No such environment: " + envName,
	                this.lexer, begin.value.namepos);
	        }
	        // Build the environment object. Arguments and other information
	        // will be made available to the handler using properties.
	        env = environments[envName];
	        var args = this.parseArguments("\\begin{" + envName + "}", env);
	        context = {
	            mode: this.mode,
	            envName: envName,
	            parser: this,
	            lexer: this.lexer,
	            positions: args.pop(),
	        };
	        result = env.handler(context, args);
	        this.expect("\\end", false);
	        var end = this.parseFunction();

	        // Same tail rewrite for the \end{...} name. It uses the \begin
	        // name's length; a differing \end name is rejected just below.
	        global_str = global_str.substring(
	            0, global_str.length - (name.length * 2 + 2)) + name + "}";

	        if (end.value.name !== envName) {
	            throw new ParseError(
	                "Mismatch: \\begin{" + envName + "} matched " +
	                "by \\end{" + end.value.name + "}",
	                this.lexer /* , end.value.namepos */);
	            // TODO: Add position to the above line and adjust test case,
	            // requires #385 to get merged first
	        }
	        result.position = end.position;

	        return result;
	    } else if (func.value === "\\matrix" || func.value === "\\pmatrix" ||
	            func.value === "\\cases") {
	        // Brace form, e.g. \matrix{...}: the command arrives as a plain
	        // symbol node (func.value holds its text) and is routed to the
	        // environment of the same name, with the braces delimiting the
	        // body.
	        envName = func.value.slice(1);
	        if (!environments.hasOwnProperty(envName)) {
	            // Fail with a parse error instead of dereferencing undefined.
	            throw new ParseError(
	                "No such environment: " + envName,
	                this.lexer, this.pos);
	        }
	        env = environments[envName];
	        this.expect("{", true);
	        context = {
	            mode: this.mode,
	            envName: envName,
	            parser: this,
	            lexer: this.lexer
	        };

	        // No arguments are parsed for this form; the handler receives an
	        // empty object in place of the argument list.
	        result = env.handler(context, {} );
	        this.expect("}", true);

	        return result;
	    } else if (utils.contains(sizeFuncs, func)) {
	        // If we see a sizing function, parse out the implicit body
	        body = this.parseExpression(false);

	        return new ParseNode("sizing", {
	            // The size is the 1-based position of the command in sizeFuncs
	            original: func,
	            size: "size" + (utils.indexOf(sizeFuncs, func) + 1),
	            value: body,
	        }, this.mode);
	    } else if (utils.contains(styleFuncs, func)) {
	        // If we see a styling function, parse out the implicit body,
	        // stopping at infix operators
	        body = this.parseExpression(true);
	        return new ParseNode("styling", {
	            // The style name is the command without its leading backslash
	            // and trailing "style"
	            original: func,
	            style: func.slice(1, func.length - 5),
	            value: body,
	        }, this.mode);
	    }

	    // Defer to parseFunction if it's not a function we handle
	    return this.parseFunction(start);
	};

	/**
	* Parses an entire function, including its base and all of its arguments.
	* The base might either have been parsed already, in which case
	* it is provided as an argument, or it's the next group in the input.
	*
	* @param {ParseFuncOrArgument=} baseGroup optional as described above
	* @return {?ParseNode}
	*/
	Parser.prototype.parseFunction = function(baseGroup) {
	    // Use the caller's base if there is one, otherwise parse the next group.
	    var group = baseGroup || this.parseGroup();
	    if (!group) {
	        return null;
	    }

	    // A plain argument is already a finished node.
	    if (!group.isFunction) {
	        return group.result;
	    }

	    // Functions are not restricted by mode here: the text-mode check is
	    // disabled, so every known function is accepted in both modes.
	    var funcName = group.result;
	    var funcData = functions[funcName];

	    // parseArguments returns the arguments with the positions list as its
	    // last element; split the two before calling the handler.
	    var args = this.parseArguments(funcName, funcData);
	    var positions = args.pop();
	    var handled = this.callFunction(funcName, args, positions);
	    return new ParseNode(handled.type, handled, this.mode);
	};

	/**
	* Call a function handler with a suitable context and arguments.
	*/
	Parser.prototype.callFunction = function(name, args, positions) {
	    // Everything a handler in functions.js gets to know about the call site.
	    var context = {};
	    context.funcName = name;
	    context.parser = this;
	    context.lexer = this.lexer;
	    context.positions = positions;

	    // Invoked as a method of the function's entry in the functions table.
	    return functions[name].handler(context, args);
	};

	/**
	* Parses the arguments of a function or environment
	*
	* @param {string} func "\name" or "\begin{name}"
	* @param {{numArgs:number,numOptionalArgs:number|undefined}} funcData
	* @return the array of arguments, with the list of positions as last element
	*/
	Parser.prototype.parseArguments = function(func, funcData) {
	    // numOptionalArgs may be undefined; treat that as "no optional
	    // arguments" instead of letting the sum become NaN, which would skip
	    // the mandatory arguments as well.
	    var numOptionalArgs = funcData.numOptionalArgs || 0;
	    var totalArgs = funcData.numArgs + numOptionalArgs;
	    if (totalArgs === 0) {
	        return [[this.pos]];
	    }

	    var baseGreediness = funcData.greediness;
	    // positions[0] is where the arguments start; one more entry is added
	    // after every argument.
	    var positions = [this.pos];
	    var args = [];
	    var argNode;

	    for (var i = 0; i < totalArgs; i++) {
	        var argType = funcData.argTypes && funcData.argTypes[i];
	        var arg;
	        if (i < numOptionalArgs) {
	            // Optional arguments come first and may be absent
	            if (argType) {
	                arg = this.parseSpecialGroup(argType, true);
	            } else {
	                arg = this.parseOptionalGroup();
	            }
	            if (!arg) {
	                args.push(null);
	                positions.push(this.pos);
	                continue;
	            }
	        } else {
	            if (argType) {
	                arg = this.parseSpecialGroup(argType);
	            } else {
	                arg = this.parseGroup();
	            }
	            if (!arg) {
	                if (!this.settings.throwOnError &&
	                        this.nextToken.text[0] === "\\") {
	                    // Keep an unsupported command as an error-coloured
	                    // text node instead of failing
	                    arg = new ParseFuncOrArgument(
	                        this.handleUnsupportedCmd(),
	                        false);
	                } else {
	                    throw new ParseError(
	                        "Expected group after '" + func + "'",
	                        this.lexer, this.pos);
	                }
	            }
	        }

	        // Start every argument from a clean slate: `var` is function
	        // scoped, so without this reset the node of the previous argument
	        // would be pushed again in the ungreedy-function case below.
	        argNode = undefined;
	        if (arg.isFunction) {
	            var argGreediness = functions[arg.result].greediness;
	            if (argGreediness > baseGreediness) {
	                argNode = this.parseFunction(arg);
	            }
	            // A function that is not greedy enough is tolerated rather
	            // than rejected; its slot stays undefined.
	        } else {
	            argNode = arg.result;
	        }
	        args.push(argNode);
	        positions.push(this.pos);
	    }

	    // The list of positions travels as the last element
	    args.push(positions);

	    return args;
	};


	/**
	* Parses a group when the mode is changing. Takes the mode to parse the group
	* in and whether the group is optional; the parser's own mode is restored
	* before returning.
	*
	* @return {?ParseFuncOrArgument}
	*/
	Parser.prototype.parseSpecialGroup = function(innerMode, optional) {
	    var outerMode = this.mode;
	    // Handle `original` argTypes: parse in whatever mode we are already in
	    if (innerMode === "original") {
	        innerMode = outerMode;
	    }

	    if (innerMode === "color" || innerMode === "size") {
	        // color and size modes are special because they should have braces
	        // and should only lex a single symbol inside
	        var openBrace = this.nextToken;
	        if (optional && openBrace.text !== "[") {
	            // optional arguments should return null if they don't exist
	            return null;
	        }
	        // The call to expect will lex the token after the opening
	        // delimiter in inner mode
	        this.mode = innerMode;
	        this.expect(optional ? "[" : "{");
	        var inner = this.nextToken;
	        this.mode = outerMode;
	        // A color is carried by the token text, a size by the token data
	        var data;
	        if (innerMode === "color") {
	            data = inner.text;
	        } else {
	            data = inner.data;
	        }
	        this.consume(); // consume the token stored in inner
	        this.expect(optional ? "]" : "}");
	        return new ParseFuncOrArgument(
	            new ParseNode(innerMode, data, outerMode),
	            false);
	    } else if (innerMode === "text") {
	        // text mode is special because it should ignore the whitespace
	        // before it
	        var whitespace = this.lexer.lex(this.pos, "whitespace");
	        this.pos = whitespace.position;
	    }

	    // By the time we get here, innerMode is one of "text" or "math".
	    // We switch the mode of the parser, recurse, then restore the old mode.
	    this.mode = innerMode;
	    this.nextToken = this.lexer.lex(this.pos, innerMode);
	    var res;
	    if (optional) {
	        res = this.parseOptionalGroup();
	    } else {
	        res = this.parseGroup();
	    }
	    this.mode = outerMode;
	    // Re-lex the look ahead in the outer mode
	    this.nextToken = this.lexer.lex(this.pos, outerMode);
	    return res;
	};

	/**
	* Parses a group, which is either a single nucleus (like "x") or an expression
	* in braces (like "{x+y}")
	*
	* @return {?ParseFuncOrArgument}
	*/
	Parser.prototype.parseGroup = function() {
	    // Without an opening brace the group is just a single nucleus.
	    if (this.nextToken.text !== "{") {
	        return this.parseSymbol();
	    }

	    // Braced group: swallow "{", parse the contents, insist on "}".
	    this.consume();
	    var contents = this.parseExpression(false);
	    this.expect("}");

	    var groupNode = new ParseNode("ordgroup", contents, this.mode);
	    return new ParseFuncOrArgument(groupNode, false);
	};

	/**
	* Parses a group, which is an expression in brackets (like "[x+y]")
	*
	* @return {?ParseFuncOrArgument}
	*/
	Parser.prototype.parseOptionalGroup = function() {
	    // No opening bracket means the optional group is simply absent.
	    if (this.nextToken.text !== "[") {
	        return null;
	    }

	    // Bracketed group: swallow "[", parse up to "]", insist on "]".
	    this.consume();
	    var contents = this.parseExpression(false, "]");
	    this.expect("]");

	    var groupNode = new ParseNode("ordgroup", contents, this.mode);
	    return new ParseFuncOrArgument(groupNode, false);
	};

	/**
	* Parse a single symbol out of the string. Here, we handle both the functions
	* we have defined, as well as the single character symbols
	*
	* @return {?ParseFuncOrArgument}
	*/
	Parser.prototype.parseSymbol = function() {
	    var nucleus = this.nextToken;

	    if (functions[nucleus.text]) {
	        this.consume();
	        // If there exists a function with this name, we return the
	        // function name and say that it is a function.
	        return new ParseFuncOrArgument(
	            nucleus.text,
	            true);
	    } else if (symbols[this.mode][nucleus.text]) {
	        this.consume();
	        // Otherwise, if this is a known symbol of the current mode, build
	        // a node of the group the symbols map assigns to it
	        return new ParseFuncOrArgument(
	            new ParseNode(symbols[this.mode][nucleus.text].group,
	                nucleus.text, this.mode),
	            false);
	    } else if (nucleus.text === "EOF" || nucleus.text === "{") {
	        // End of input and an opening brace are left unconsumed for the
	        // caller to deal with
	        return null;
	    } else {
	        // Any other unknown token is accepted instead of rejected: it is
	        // consumed and wrapped in a node that borrows the group of the
	        // math-mode \sigma symbol, keeping the token's own text as value.
	        this.consume();
	        return new ParseFuncOrArgument(
	            new ParseNode(symbols["math"]["\\sigma"].group,
	                nucleus.text, this.mode),
	            false);
	    }
	};

	// Expose the ParseNode constructor on parser instances as well.
	Parser.prototype.ParseNode = ParseNode;

	// The module's sole export is the Parser constructor.
	module.exports = Parser;