"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.applyCodestralChatTemplate = exports.openaiGetHeaderStringForMessage = exports.openaiGetHeaderTokensForMessage = exports.getTokenizerByName_ONLY_FOR_OPENAI_TOKENIZERS = exports.CODESTRAL_ONLY_USE_ESTIMATE_NUM_TOKENS_FAST = exports.O200K_SPECIAL_TOKENS = exports.O200K = exports.CL100K_SPECIAL_TOKENS = exports.CL100K = void 0;
exports.numTokens = numTokens;
exports.estimateNumTokensFast_SYNCHRONOUS_BE_CAREFUL = estimateNumTokensFast_SYNCHRONOUS_BE_CAREFUL;
exports.estimateNumTokensFast = estimateNumTokensFast;
exports.encodeTokens = encodeTokens;
exports.decodeTokens = decodeTokens;
exports.estimateTokensUsingBytecount = estimateTokensUsingBytecount;
exports.estimateTokensUsingCharcount = estimateTokensUsingCharcount;
exports.numTokensForImage = numTokensForImage;
// Canonical tokenizer identifiers used throughout this module.
const CL100K_BASE = 'cl100k_base';
const R50K_BASE = 'r50k_base';
const P50K_BASE = 'p50k_base';
const GPT2_TOKENIZER = 'gpt2';
const LLAMA3_TOKENIZER = 'llama3';
const O200K_BASE = 'o200k_base';
const CODESTRAL_BASE = 'codestral';
// Every tokenizer name this module recognises, including the two
// `*_special_tokens` variants that have no dedicated constant above.
// NOTE(review): this list is not referenced anywhere in the visible part of
// the file — presumably a leftover of the typed source; confirm before removing.
const usableTokenizers = [
    CL100K_BASE,
    'cl100k_base_special_tokens',
    R50K_BASE,
    P50K_BASE,
    GPT2_TOKENIZER,
    LLAMA3_TOKENIZER,
    O200K_BASE,
    'o200k_base_special_tokens',
    CODESTRAL_BASE
];
/**
 * Collects the textual pieces of a multi-part message content array.
 *
 * Plain strings are kept as they are, parts with `type === 'text'` contribute
 * their `text` field, and every other part (for example `image_url`) is
 * dropped.
 *
 * @param {Array} content - Message content parts (strings or typed objects).
 * @returns {Array} The text pieces in their original order.
 */
function contentArrayToStringContent(content) {
    return content.reduce((pieces, part) => {
        if (typeof part === 'string') {
            pieces.push(part);
            return pieces;
        }
        if (part.type === 'text') {
            pieces.push(part.text);
        }
        // Anything else (images included) contributes no text.
        return pieces;
    }, []);
}
/**
 * Renders a list of chat messages into a single prompt string.
 *
 * Each message becomes `header + content`, where the header comes from
 * `openaiGetHeaderStringForMessage`. Array content is reduced to its text
 * parts and concatenated. Every message except the first is prefixed with the
 * tokenizer family's end-of-message string.
 *
 * @param {Array} messages - Chat messages with `role` and `content`.
 * @param {string} tokenizer - One of `o200k_base`, `o200k_base_special_tokens`,
 *     `cl100k_base`, `cl100k_base_special_tokens`.
 * @returns {string} The concatenated prompt (empty string for no messages).
 * @throws {Error} If `tokenizer` is not one of the four supported names.
 */
function openaiChatMessagesToPrompt(messages, tokenizer) {
    const isO200k = tokenizer === 'o200k_base' || tokenizer === 'o200k_base_special_tokens';
    const isCl100k = tokenizer === 'cl100k_base' || tokenizer === 'cl100k_base_special_tokens';
    if (!isO200k && !isCl100k) {
        // The message lists all four accepted names; it previously omitted
        // o200k_base_special_tokens even though the guard accepts it.
        throw new Error(`Invalid tokenizer: ${tokenizer}. Only o200k_base, o200k_base_special_tokens, cl100k_base, and cl100k_base_special_tokens tokenizers are supported.`);
    }
    const parts = messages.map((msg, i) => {
        const headerString = (0, exports.openaiGetHeaderStringForMessage)(msg, tokenizer);
        const newContent = Array.isArray(msg.content)
            ? contentArrayToStringContent(msg.content).join('')
            : msg.content;
        if (i === 0) {
            return headerString + newContent;
        }
        // Openai always adds the eos token before every non-starting message
        const endTokenString = isO200k
            ? O200K_END_TOKEN_STRING
            : CL100K_END_TOKEN_STRING;
        return endTokenString + headerString + newContent;
    });
    return parts.join('');
}
/**
 * Token-level counterpart of the chat prompt renderer.
 *
 * Every message is encoded independently (all messages concurrently) as
 * header tokens followed by content tokens. Array content is reduced to its
 * text parts, each part encoded on its own, and the results concatenated.
 * Every message except the first is prefixed with the tokenizer family's
 * end-of-message token id.
 *
 * @param {Array} messages - Chat messages with `role` and `content`.
 * @param {string} tokenizer - One of the four OpenAI tokenizer names.
 * @returns {Promise<Array>} Flat list of token ids for the whole conversation.
 * @throws {Error} (as a rejection) if `tokenizer` is not supported.
 */
async function openaiChatMessagesToTokens(messages, tokenizer) {
    const supportedTokenizers = [
        'o200k_base',
        'cl100k_base',
        'cl100k_base_special_tokens',
        'o200k_base_special_tokens'
    ];
    if (!supportedTokenizers.includes(tokenizer)) {
        throw new Error(`Invalid tokenizer: ${tokenizer}. Only o200k_base, cl100k_base, and cl100k_base_special_tokens, o200k_base_special_tokens tokenizers are supported.`);
    }
    const encodeMessage = async (msg, index) => {
        const getHeaderTokens = exports.openaiGetHeaderTokensForMessage;
        const headerTokens = await getHeaderTokens(msg, tokenizer);
        let contentTokens;
        if (Array.isArray(msg.content)) {
            const textPieces = contentArrayToStringContent(msg.content);
            const encodedPieces = await Promise.all(textPieces.map((piece) => encodeTokens(piece, { tokenizer })));
            contentTokens = encodedPieces.flat();
        }
        else {
            contentTokens = await encodeTokens(msg.content, { tokenizer });
        }
        if (index === 0) {
            return [...headerTokens, ...contentTokens];
        }
        // Openai always adds the eos token before every non-starting message
        const usesO200k = tokenizer === 'o200k_base' || tokenizer === 'o200k_base_special_tokens';
        const eosTokenId = usesO200k ? O200K_END_TOKEN : CL100K_END_TOKEN;
        return [eosTokenId, ...headerTokens, ...contentTokens];
    };
    const perMessageTokens = await Promise.all(messages.map((msg, index) => encodeMessage(msg, index)));
    return perMessageTokens.flat();
}
exports.CL100K = {
    name: 'cl100k_base',
    encodeTokens: (text) => encodeTokens(text, { tokenizer: 'cl100k_base' }),
    decodeTokens: (tokens) => decodeTokens(tokens, { tokenizer: 'cl100k_base' }),
    numTokens: (text) => numTokens(text, { tokenizer: 'cl100k_base' }),
    estimateNumTokensFast_SYNCHRONOUS_BE_CAREFUL: (text) => estimateNumTokensFast_SYNCHRONOUS_BE_CAREFUL(text, {
        tokenizer: 'cl100k_base'
    }),
    estimateNumTokensFast: (text) => estimateNumTokensFast(text, { tokenizer: 'cl100k_base' }),
    estimateTokensUsingCharCount: (text) => estimateTokensUsingCharcount(text, 'cl100k_base'),
    getEosToken: () => CL100K_END_TOKEN_STRING,
    getEosTokenId: () => CL100K_END_TOKEN,
    getHeaderStringForMessage: (message) => (0, exports.openaiGetHeaderStringForMessage)(message, 'cl100k_base'),
    getHeaderTokensForMessage: (message) => (0, exports.openaiGetHeaderTokensForMessage)(message, 'cl100k_base'),
    applyChatTemplate: (messages) => openaiChatMessagesToPrompt(messages, 'cl100k_base'),
    applyChatTemplateTokens: async (messages) => openaiChatMessagesToTokens(messages, 'cl100k_base'),
    shouldAddEosTokenToEachMessage: true
};
exports.CL100K_SPECIAL_TOKENS = {
    name: 'cl100k_base_special_tokens',
    encodeTokens: (text) => encodeTokens(text, { tokenizer: 'cl100k_base_special_tokens' }),
    decodeTokens: (tokens) => decodeTokens(tokens, { tokenizer: 'cl100k_base_special_tokens' }),
    numTokens: (text) => numTokens(text, { tokenizer: 'cl100k_base_special_tokens' }),
    estimateNumTokensFast_SYNCHRONOUS_BE_CAREFUL: (text) => estimateNumTokensFast_SYNCHRONOUS_BE_CAREFUL(text, {
        tokenizer: 'cl100k_base_special_tokens'
    }),
    estimateNumTokensFast: (text) => estimateNumTokensFast(text, { tokenizer: 'cl100k_base_special_tokens' }),
    estimateTokensUsingCharCount: (text) => estimateTokensUsingCharcount(text, 'cl100k_base_special_tokens'),
    getEosToken: () => CL100K_END_TOKEN_STRING,
    getEosTokenId: () => CL100K_END_TOKEN,
    getHeaderStringForMessage: (message) => (0, exports.openaiGetHeaderStringForMessage)(message, 'cl100k_base_special_tokens'),
    getHeaderTokensForMessage: (message) => (0, exports.openaiGetHeaderTokensForMessage)(message, 'cl100k_base_special_tokens'),
    applyChatTemplate: (messages) => openaiChatMessagesToPrompt(messages, 'cl100k_base_special_tokens'),
    applyChatTemplateTokens: async (messages) => openaiChatMessagesToTokens(messages, 'cl100k_base_special_tokens'),
    shouldAddEosTokenToEachMessage: true
};
exports.O200K = {
    name: 'o200k_base',
    encodeTokens: (text) => encodeTokens(text, { tokenizer: 'o200k_base' }),
    decodeTokens: (tokens) => decodeTokens(tokens, { tokenizer: 'o200k_base' }),
    numTokens: (text) => numTokens(text, { tokenizer: 'o200k_base' }),
    estimateNumTokensFast_SYNCHRONOUS_BE_CAREFUL: (text) => estimateNumTokensFast_SYNCHRONOUS_BE_CAREFUL(text, {
        tokenizer: 'o200k_base'
    }),
    estimateNumTokensFast: (text) => estimateNumTokensFast(text, { tokenizer: 'o200k_base' }),
    estimateTokensUsingCharCount: (text) => estimateTokensUsingCharcount(text, 'o200k_base'),
    getEosToken: () => O200K_END_TOKEN_STRING,
    getEosTokenId: () => O200K_END_TOKEN,
    getHeaderStringForMessage: (message) => (0, exports.openaiGetHeaderStringForMessage)(message, 'o200k_base'),
    getHeaderTokensForMessage: (message) => (0, exports.openaiGetHeaderTokensForMessage)(message, 'o200k_base'),
    applyChatTemplate: (messages) => openaiChatMessagesToPrompt(messages, 'o200k_base'),
    applyChatTemplateTokens: async (messages) => openaiChatMessagesToTokens(messages, 'o200k_base'),
    shouldAddEosTokenToEachMessage: true
};
exports.O200K_SPECIAL_TOKENS = {
    name: 'o200k_base_special_tokens',
    encodeTokens: (text) => encodeTokens(text, { tokenizer: 'o200k_base_special_tokens' }),
    decodeTokens: (tokens) => decodeTokens(tokens, { tokenizer: 'o200k_base_special_tokens' }),
    numTokens: (text) => numTokens(text, { tokenizer: 'o200k_base_special_tokens' }),
    estimateNumTokensFast_SYNCHRONOUS_BE_CAREFUL: (text) => estimateNumTokensFast_SYNCHRONOUS_BE_CAREFUL(text, {
        tokenizer: 'o200k_base_special_tokens'
    }),
    estimateNumTokensFast: (text) => estimateNumTokensFast(text, { tokenizer: 'o200k_base_special_tokens' }),
    estimateTokensUsingCharCount: (text) => estimateTokensUsingCharcount(text, 'o200k_base_special_tokens'),
    getEosToken: () => O200K_END_TOKEN_STRING,
    getEosTokenId: () => O200K_END_TOKEN,
    getHeaderStringForMessage: (message) => (0, exports.openaiGetHeaderStringForMessage)(message, 'o200k_base_special_tokens'),
    getHeaderTokensForMessage: (message) => (0, exports.openaiGetHeaderTokensForMessage)(message, 'o200k_base_special_tokens'),
    applyChatTemplate: (messages) => openaiChatMessagesToPrompt(messages, 'o200k_base_special_tokens'),
    applyChatTemplateTokens: async (messages) => openaiChatMessagesToTokens(messages, 'o200k_base_special_tokens'),
    shouldAddEosTokenToEachMessage: true
};
const CODESTRAL_BOS_TOKEN = '<s>';
const CODESTRAL_EOS_TOKEN = '</s>';
const CODESTRAL_EOS_TOKEN_ID = 2;
exports.CODESTRAL_ONLY_USE_ESTIMATE_NUM_TOKENS_FAST = {
    name: 'codestral',
    encodeTokens: (text) => encodeTokens(text, { tokenizer: 'codestral' }),
    decodeTokens: (tokens) => decodeTokens(tokens, { tokenizer: 'codestral' }),
    numTokens: (text) => numTokens(text, { tokenizer: 'codestral' }),
    estimateNumTokensFast_SYNCHRONOUS_BE_CAREFUL: (text) => estimateNumTokensFast_SYNCHRONOUS_BE_CAREFUL(text, {
        tokenizer: 'codestral'
    }),
    estimateNumTokensFast: (text) => estimateNumTokensFast(text, { tokenizer: 'codestral' }),
    estimateTokensUsingCharCount: (text) => estimateTokensUsingCharcount(text, 'codestral'),
    getEosToken: () => CODESTRAL_EOS_TOKEN,
    getEosTokenId: () => CODESTRAL_EOS_TOKEN_ID,
    getHeaderStringForMessage: (message) => (0, exports.openaiGetHeaderStringForMessage)(message, 'codestral'),
    getHeaderTokensForMessage: (message) => (0, exports.openaiGetHeaderTokensForMessage)(message, 'codestral'),
    applyChatTemplate: (messages) => (0, exports.applyCodestralChatTemplate)(messages),
    applyChatTemplateTokens: async (messages) => openaiChatMessagesToTokens(messages, 'codestral'),
    shouldAddEosTokenToEachMessage: true
};
/**
 * Looks up one of the exported tokenizer facades by its `name`.
 *
 * @param {string} name - `cl100k_base`, `cl100k_base_special_tokens`,
 *     `o200k_base`, `o200k_base_special_tokens` or `codestral`.
 * @returns {object} The matching exported tokenizer object.
 * @throws {Error} If `name` matches none of the facades.
 */
const getTokenizerByName_ONLY_FOR_OPENAI_TOKENIZERS = (name) => {
    switch (name) {
        case 'cl100k_base':
            return exports.CL100K;
        case 'cl100k_base_special_tokens':
            return exports.CL100K_SPECIAL_TOKENS;
        case 'o200k_base':
            return exports.O200K;
        // This variant is exported like the others but had no case here, so
        // looking it up by its own `name` used to throw.
        case 'o200k_base_special_tokens':
            return exports.O200K_SPECIAL_TOKENS;
        case 'codestral':
            return exports.CODESTRAL_ONLY_USE_ESTIMATE_NUM_TOKENS_FAST;
        default:
            throw new Error(`Unknown tokenizer ${name}`);
    }
};
exports.getTokenizerByName_ONLY_FOR_OPENAI_TOKENIZERS = getTokenizerByName_ONLY_FOR_OPENAI_TOKENIZERS;
/**
 * Resolves to the estimated number of tokens in `text`.
 *
 * The value comes from the synchronous character-count estimator; no real
 * tokenization happens here.
 *
 * @param {string} text - Text to measure.
 * @param {{tokenizer: string}} opts - Selects the tokenizer-specific estimate.
 * @returns {Promise<number>} The estimated token count.
 */
async function numTokens(text, opts) {
    const estimatedCount = estimateNumTokensFast_SYNCHRONOUS_BE_CAREFUL(text, opts);
    return estimatedCount;
}
// The name carries a warning from the original authors: treat this as
// potentially blocking the event loop and keep it to data jobs or small inputs.
/**
 * Synchronously estimates the token count of `text` as the floored midpoint
 * of the [lower, upper] range returned by the character-count estimator.
 *
 * @param {string} text - Text to measure.
 * @param {{tokenizer: string}} opts - Selects the tokenizer-specific range.
 * @returns {number} Integer estimate of the token count.
 */
function estimateNumTokensFast_SYNCHRONOUS_BE_CAREFUL(text, opts) {
    const bounds = estimateTokensUsingCharcount(text, opts.tokenizer);
    const lower = bounds[0];
    const upper = bounds[1];
    const midpoint = (upper - lower) / 2 + lower;
    return Math.floor(midpoint);
}
/**
 * Async alias of `numTokens`: resolves to the estimated token count of `text`.
 *
 * @param {string} text - Text to measure.
 * @param {{tokenizer: string}} opts - Selects the tokenizer-specific estimate.
 * @returns {Promise<number>} The estimated token count.
 */
async function estimateNumTokensFast(text, opts) {
    const tokenCount = await numTokens(text, opts);
    return tokenCount;
}
/**
 * Encodes `text` into token ids.
 *
 * NOTE: in this file the function is a stub — it ignores both arguments and
 * always resolves to an empty array, so anything built on it (header/name
 * injection, chat-template token lists) sees no content tokens.
 *
 * @param {string} text - Text to encode (currently unused).
 * @param {{tokenizer: string}} opts - Tokenizer selection (currently unused).
 * @returns {Promise<Array>} Always an empty array.
 */
async function encodeTokens(text, opts) {
    return [];
}
/**
 * Decodes token ids back into text.
 *
 * NOTE: in this file the function is a stub — it ignores both arguments and
 * always resolves to the empty string.
 *
 * @param {Array} tokens - Token ids to decode (currently unused).
 * @param {{tokenizer: string}} opts - Tokenizer selection (currently unused).
 * @returns {Promise<string>} Always ''.
 */
async function decodeTokens(tokens, opts) {
    return '';
}
// Shared UTF-8 encoder, used only to measure the byte length of text.
const encoder = new TextEncoder();
/**
 * Returns a very conservative [lower, upper] bound on the number of tokens in
 * `text`, derived from its UTF-8 byte length.
 *
 * The lower bound is always bytes / 10. The upper bound is bytes / 2.5 for the
 * cl100k/o200k tokenizers and the more conservative bytes / 2 for any other.
 *
 * @param {string} text - Text to measure.
 * @param {string} tokenizer - Tokenizer name.
 * @returns {Array<number>} `[lower, upper]`, not rounded.
 */
function estimateTokensUsingBytecount(text, tokenizer) {
    const byteLength = encoder.encode(text).length;
    const isOpenAiEncoding = tokenizer === 'cl100k_base' ||
        tokenizer === 'cl100k_base_special_tokens' ||
        tokenizer === 'o200k_base' ||
        tokenizer === 'o200k_base_special_tokens';
    // Unknown tokenizers get the smaller divisor, i.e. the larger upper bound.
    const upperDivisor = isOpenAiEncoding ? 2.5 : 2;
    return [byteLength / 10, byteLength / upperDivisor];
}
/**
 * Returns a [lower, upper] bound on the number of tokens in `text`, derived
 * from `text.length`.
 *
 * The lower bound is always length / 10. The upper bound is length / 1.5 for
 * the cl100k/o200k tokenizers; for any other tokenizer it is the length
 * itself (one token per character, the conservative choice).
 *
 * @param {string} text - Text to measure.
 * @param {string} tokenizer - Tokenizer name.
 * @returns {Array<number>} `[lower, upper]`, not rounded.
 */
function estimateTokensUsingCharcount(text, tokenizer) {
    const openAiEncodings = [
        'cl100k_base',
        'cl100k_base_special_tokens',
        'o200k_base',
        'o200k_base_special_tokens'
    ];
    const charCount = text.length;
    const lowerBound = charCount / 10;
    if (openAiEncodings.includes(tokenizer)) {
        return [lowerBound, charCount / 1.5];
    }
    // conservative!
    return [lowerBound, charCount];
}
/**
 * Estimates the token cost of an image from its pixel dimensions.
 *
 * `low` detail costs a flat 85 tokens. For `high` and `auto` the image is
 * first shrunk to fit within 2048 x 2048, then rescaled so its shortest side
 * is 768 px, and finally charged 85 tokens per 512 x 512 block needed to
 * cover it.
 *
 * The `dimensions` object is only read. Earlier versions overwrote its
 * `width`/`height` with the rescaled values, silently corrupting the caller's
 * data and giving different answers when the same object was passed twice.
 *
 * @param {{width: number, height: number}} dimensions - Image size in pixels.
 * @param {string} detail - `'low'`, `'high'` or `'auto'`.
 * @returns {number} Estimated token count.
 * @throws {Error} If `detail` is none of the three known levels.
 */
function numTokensForImage(dimensions, detail) {
    if (detail === 'low') {
        return 85;
    }
    if (detail !== 'high' && detail !== 'auto') {
        throw new Error(`Unknown detail level ${detail}`);
    }
    // Work on local copies so the caller's object is left untouched.
    let width = dimensions.width;
    let height = dimensions.height;
    // First, we rescale to fit within 2048 x 2048
    const largestRatio = Math.max(width / 2048, height / 2048);
    if (largestRatio > 1) {
        width = Math.floor(width / largestRatio);
        height = Math.floor(height / largestRatio);
    }
    // Next, we scale the shortest side to be 768 px
    const smallestRatio = Math.min(width / 768, height / 768);
    width = Math.floor(width / smallestRatio);
    height = Math.floor(height / smallestRatio);
    // Finally, we calculate the number of 512 x 512 blocks needed to cover the image
    // and pay 85 tokens per block
    // NOTE(review): OpenAI's published vision pricing appears to be 170 tokens
    // per tile plus a flat 85 — confirm whether 85 per block is intentional.
    const numWidthBlocks = Math.ceil(width / 512);
    const numHeightBlocks = Math.ceil(height / 512);
    return numWidthBlocks * numHeightBlocks * 85;
}
// Pre-encoded message headers for the cl100k tokenizers, one per role.
// Each header is three token ids; the first and last are shared by all roles
// (presumably the ids of <|im_start|> and <|im_sep|>, with the role name in
// between — confirm against the encoding).
const CL100K_SYSTEM_TOKENS = [100264, 9125, 100266];
const CL100K_USER_TOKENS = [100264, 882, 100266];
const CL100K_TOOL_TOKENS = [100264, 14506, 100266];
const CL100K_ASSISTANT_TOKENS = [100264, 78191, 100266];
// Token id placed before every non-first message (see CL100K_END_TOKEN_STRING).
const CL100K_END_TOKEN = 100265;
// String form of the cl100k headers and of the end-of-message marker.
const CL100K_SYSTEM_TOKENS_STRING = '<|im_start|>system<|im_sep|>';
const CL100K_USER_TOKENS_STRING = '<|im_start|>user<|im_sep|>';
const CL100K_ASSISTANT_TOKENS_STRING = '<|im_start|>assistant<|im_sep|>';
const CL100K_END_TOKEN_STRING = '<|im_end|>';
// Pre-encoded message headers for the o200k tokenizers, same layout as above.
const O200K_SYSTEM_TOKENS = [200006, 17360, 200008];
const O200K_USER_TOKENS = [200006, 1428, 200008];
const O200K_TOOL_TOKENS = [200006, 17952, 200008];
const O200K_ASSISTANT_TOKENS = [200006, 173781, 200008];
// Token id placed before every non-first message (see O200K_END_TOKEN_STRING).
const O200K_END_TOKEN = 200007;
// String form of the o200k headers and of the end-of-message marker.
const O200K_SYSTEM_TOKENS_STRING = '<|im_start|>system<|im_sep|>';
const O200K_USER_TOKENS_STRING = '<|im_start|>user<|im_sep|>';
const O200K_TOOL_TOKENS_STRING = '<|im_start|>tool<|im_sep|>';
const O200K_ASSISTANT_TOKENS_STRING = '<|im_start|>assistant<|im_sep|>';
const O200K_END_TOKEN_STRING = '<|im_end|>';
/**
 * Inserts an encoded `:<name>` suffix into a header token list, just before
 * the header's final token.
 *
 * The original authors flagged this format as a best guess.
 *
 * @param {Array} tokens - Header token ids; the last one stays last.
 * @param {string} name - Participant name to embed.
 * @param {string} tokenizer - Tokenizer used to encode the suffix.
 * @returns {Promise<Array>} A new token list; `tokens` is not modified.
 */
async function injectName(tokens, name, tokenizer) {
    // i don't really know if this is the right way to format it....
    const encodedSuffix = await encodeTokens(':' + name, { tokenizer });
    const leading = tokens.slice(0, -1);
    const closing = tokens[tokens.length - 1];
    return [...leading, ...encodedSuffix, closing];
}
/**
 * Inserts an encoded ` to=<recipient>` suffix into a header token list, just
 * before the header's final token.
 *
 * @param {Array} tokens - Header token ids; the last one stays last.
 * @param {string} to - Recipient to embed.
 * @param {string} tokenizer - Tokenizer used to encode the suffix.
 * @returns {Promise<Array>} A new token list; `tokens` is not modified.
 */
async function injectTo(tokens, to, tokenizer) {
    const encodedSuffix = await encodeTokens(' to=' + to, { tokenizer });
    const leading = tokens.slice(0, -1);
    const closing = tokens[tokens.length - 1];
    return [...leading, ...encodedSuffix, closing];
}
/**
 * Inserts `:<name>` into a header string, right before its first
 * `<|im_sep|>` marker. A header without that marker is returned unchanged.
 *
 * @param {string} tokens - Header string such as `<|im_start|>user<|im_sep|>`.
 * @param {string} name - Participant name to embed, taken literally.
 * @returns {string} The header with the name inserted.
 */
function injectNameString(tokens, name) {
    const replacement = ':' + name + '<|im_sep|>';
    // A replacer function is used because a replacement *string* gives `$`
    // sequences special meaning (`$&`, `$$`, ...), which would mangle names
    // that contain a dollar sign.
    return tokens.replace('<|im_sep|>', () => replacement);
}
/**
 * Builds the header token ids that precede a chat message's content.
 *
 * The base header depends on the message role and on the tokenizer family
 * (o200k vs cl100k). If the message carries a `name` and/or a `to` field,
 * their encoded forms are inserted before the header's final token, name
 * first.
 *
 * Both `*_special_tokens` variants are accepted and use their family's ids.
 * Previously `o200k_base_special_tokens` was rejected here even though the
 * exported facade and the chat-template encoder pass it in.
 *
 * @param {object} message - Message with `role` and optional `name` / `to`.
 * @param {string} tokenizer - One of the four OpenAI tokenizer names.
 * @returns {Promise<Array>} Header token ids.
 * @throws {Error} (as a rejection) for an unsupported tokenizer or role.
 */
const openaiGetHeaderTokensForMessage = async (message, tokenizer) => {
    const isO200k = tokenizer === 'o200k_base' || tokenizer === 'o200k_base_special_tokens';
    const isCl100k = tokenizer === 'cl100k_base' || tokenizer === 'cl100k_base_special_tokens';
    if (!isO200k && !isCl100k) {
        throw new Error(`Invalid tokenizer: ${tokenizer}. Only o200k_base, o200k_base_special_tokens, cl100k_base, and cl100k_base_special_tokens tokenizers are supported.`);
    }
    let headerTokens;
    switch (message.role) {
        case 'system':
            headerTokens = isO200k ? O200K_SYSTEM_TOKENS : CL100K_SYSTEM_TOKENS;
            break;
        case 'user':
            headerTokens = isO200k ? O200K_USER_TOKENS : CL100K_USER_TOKENS;
            break;
        case 'assistant':
            headerTokens = isO200k ? O200K_ASSISTANT_TOKENS : CL100K_ASSISTANT_TOKENS;
            break;
        case 'tool':
            headerTokens = isO200k ? O200K_TOOL_TOKENS : CL100K_TOOL_TOKENS;
            break;
        default:
            throw new Error(`Unknown role ${message.role}`);
    }
    if ('name' in message && message.name !== undefined) {
        headerTokens = await injectName(headerTokens, message.name, tokenizer);
    }
    if ('to' in message && message.to !== undefined) {
        headerTokens = await injectTo(headerTokens, message.to, tokenizer);
    }
    return headerTokens;
};
exports.openaiGetHeaderTokensForMessage = openaiGetHeaderTokensForMessage;
// String header for cl100k tool messages, mirroring O200K_TOOL_TOKENS_STRING.
const CL100K_TOOL_TOKENS_STRING = '<|im_start|>tool<|im_sep|>';
/**
 * Builds the header string that precedes a chat message's content.
 *
 * The header depends on the message role and on the tokenizer family (o200k
 * vs cl100k). If the message carries a `name`, `:<name>` is inserted before
 * the separator.
 *
 * Tool messages under the cl100k tokenizers now get the tool header; they
 * used to get the *user* header, which disagreed with the token-id header
 * for the same message.
 *
 * NOTE(review): unlike the token-id variant, a `to` field on the message is
 * not rendered here — confirm whether that asymmetry is intended.
 *
 * @param {object} message - Message with `role` and optional `name`.
 * @param {string} tokenizer - One of the four OpenAI tokenizer names.
 * @returns {string} The header string.
 * @throws {Error} For an unsupported tokenizer or an unknown role.
 */
const openaiGetHeaderStringForMessage = (message, tokenizer) => {
    const isO200k = tokenizer === 'o200k_base' || tokenizer === 'o200k_base_special_tokens';
    const isCl100k = tokenizer === 'cl100k_base' || tokenizer === 'cl100k_base_special_tokens';
    if (!isO200k && !isCl100k) {
        throw new Error(`Invalid tokenizer: ${tokenizer}. Only o200k_base, o200k_base_special_tokens, cl100k_base, and cl100k_base_special_tokens tokenizers are supported.`);
    }
    let headerString = '';
    switch (message.role) {
        case 'system':
            headerString = isO200k
                ? O200K_SYSTEM_TOKENS_STRING
                : CL100K_SYSTEM_TOKENS_STRING;
            break;
        case 'user':
            headerString = isO200k
                ? O200K_USER_TOKENS_STRING
                : CL100K_USER_TOKENS_STRING;
            break;
        case 'assistant':
            headerString = isO200k
                ? O200K_ASSISTANT_TOKENS_STRING
                : CL100K_ASSISTANT_TOKENS_STRING;
            break;
        case 'tool':
            headerString = isO200k
                ? O200K_TOOL_TOKENS_STRING
                : CL100K_TOOL_TOKENS_STRING;
            break;
        default:
            throw new Error(`Unknown role ${message.role}`);
    }
    if ('name' in message && message.name !== undefined) {
        headerString = injectNameString(headerString, message.name);
    }
    return headerString;
};
exports.openaiGetHeaderStringForMessage = openaiGetHeaderStringForMessage;
/**
 * Renders a short conversation with the Codestral instruction template.
 *
 * Accepted shapes:
 *   - user
 *   - user, assistant
 *   - system, user
 *   - system, user, assistant
 *
 * The output starts with the BOS marker, wraps the (optional) system prompt
 * in `<<SYS>>` tags inside the `[INST]` block, and appends the assistant
 * content, if any, after `[/INST]`.
 *
 * An empty message list, or a system message with nothing after it, now
 * raises the same descriptive errors as the other invalid shapes; both used
 * to fail with a TypeError from reading `role` of `undefined`.
 *
 * @param {Array} messages - Messages with `role` and `content`.
 * @param {object} [options] - Accepted for interface compatibility; unused.
 * @returns {string} The rendered prompt.
 * @throws {Error} If the messages do not match one of the accepted shapes.
 */
const applyCodestralChatTemplate = (messages, options) => {
    const first = messages[0];
    if (first == null || (first.role !== 'system' && first.role !== 'user')) {
        throw new Error('First message must be a system message or a user message');
    }
    let chatTemplate = CODESTRAL_BOS_TOKEN;
    if (first.role === 'system') {
        chatTemplate += `[INST] <<SYS>>\n${first.content}\n<</SYS>>\n\n`;
        const second = messages[1];
        if (second == null || second.role !== 'user') {
            throw new Error('Second message must be a user message if first is system');
        }
        chatTemplate += `${second.content} [/INST]`;
        if (messages.length > 3) {
            throw new Error('Too many messages');
        }
        if (messages.length === 3) {
            if (messages[2].role !== 'assistant') {
                throw new Error('Third message with system prompt must be an assistant message');
            }
            chatTemplate += messages[2].content;
        }
        return chatTemplate;
    }
    // First message is a user message.
    chatTemplate += `[INST] ${first.content} [/INST]`;
    if (messages.length > 2) {
        throw new Error('Too many messages');
    }
    if (messages.length === 2) {
        if (messages[1].role !== 'assistant') {
            throw new Error('Second message with user prompt must be an assistant message');
        }
        chatTemplate += messages[1].content;
    }
    return chatTemplate;
};
exports.applyCodestralChatTemplate = applyCodestralChatTemplate;
