// Token-aware text splitter from langchain; tokenSplit instantiates it per call.
const { TokenTextSplitter } = require('langchain/text_splitter');
/**
 * Splits a given text into token-based chunks using a TokenTextSplitter
 * configured with the provided parameters.
 * Note: limit or memoize use of this function, as its calculation is expensive.
 *
 * @param {Object} obj - Configuration object for the text splitting operation.
 * @param {string} obj.text - The text to be split.
 * @param {string} [obj.encodingName='cl100k_base'] - Encoding name handed to the splitter.
 * @param {number} [obj.chunkSize=1] - The token size of each chunk.
 * @param {number} [obj.chunkOverlap=0] - The number of chunk elements to be overlapped
 *   between adjacent chunks.
 * @param {number} [obj.returnSize] - If greater than 0, only the last `returnSize`
 *   chunks are returned. Omitted, 0, or negative values return every chunk.
 *
 * @returns {Promise<string[]>} Resolves to the array of text chunks.
 *   If no text is provided (empty or missing), an empty array is returned
 *   without creating a splitter.
 *   If `returnSize` exceeds the number of chunks, all chunks are returned.
 *
 * @async
 * @function tokenSplit
 */
async function tokenSplit({
  text,
  encodingName = 'cl100k_base',
  chunkSize = 1,
  chunkOverlap = 0,
  returnSize,
}) {
  // Skip the (expensive) tokenizer entirely when there is nothing to split.
  if (!text) {
    return [];
  }

  const splitter = new TokenTextSplitter({
    encodingName,
    chunkSize,
    chunkOverlap,
  });
  const chunks = await splitter.splitText(text);

  // Only a positive returnSize trims the result; undefined, 0, NaN and
  // negative values all fail this comparison and fall through to the full list.
  if (returnSize > 0) {
    // A negative start index makes slice() count from the end of the array.
    return chunks.slice(-returnSize);
  }
  return chunks;
}
// CommonJS export: the module's sole export is the tokenSplit function itself.
module.exports = tokenSplit;