"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.index = exports._isBaseDocumentLoader = exports._getSourceIdAssigner = exports._deduplicateInOrder = exports._batch = exports._HashedDocument = void 0;
const uuid_1 = require("uuid");
const record_manager_js_1 = require("./record_manager.cjs");
const hash_js_1 = require("../utils/hash.cjs");
const document_js_1 = require("../documents/document.cjs");
/**
 * HashedDocument is a Document with hashes calculated.
 * Hashes are calculated based on page content and metadata.
 * It is used for indexing.
 */
class _HashedDocument {
    constructor(fields) {
        Object.defineProperty(this, "uid", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "hash_", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "contentHash", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "metadataHash", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "pageContent", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "metadata", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        this.uid = fields.uid;
        this.pageContent = fields.pageContent;
        this.metadata = fields.metadata;
    }
    calculateHashes() {
        const forbiddenKeys = ["hash_", "content_hash", "metadata_hash"];
        for (const key of forbiddenKeys) {
            if (key in this.metadata) {
                throw new Error(`Metadata cannot contain key ${key} as it is reserved for internal use. Restricted keys: [${forbiddenKeys.join(", ")}]`);
            }
        }
        const contentHash = this._hashStringToUUID(this.pageContent);
        try {
            const metadataHash = this._hashNestedDictToUUID(this.metadata);
            this.contentHash = contentHash;
            this.metadataHash = metadataHash;
        }
        catch (e) {
            throw new Error(`Failed to hash metadata: ${e}. Please use a dict that can be serialized using json.`);
        }
        this.hash_ = this._hashStringToUUID(this.contentHash + this.metadataHash);
        if (!this.uid) {
            this.uid = this.hash_;
        }
    }
    toDocument() {
        return new document_js_1.Document({
            pageContent: this.pageContent,
            metadata: this.metadata,
        });
    }
    static fromDocument(document, uid) {
        const doc = new this({
            pageContent: document.pageContent,
            metadata: document.metadata,
            uid: uid || document.uid,
        });
        doc.calculateHashes();
        return doc;
    }
    _hashStringToUUID(inputString) {
        const hash_value = (0, hash_js_1.insecureHash)(inputString);
        return (0, uuid_1.v5)(hash_value, record_manager_js_1.UUIDV5_NAMESPACE);
    }
    _hashNestedDictToUUID(data) {
        const serialized_data = JSON.stringify(data, Object.keys(data).sort());
        const hash_value = (0, hash_js_1.insecureHash)(serialized_data);
        return (0, uuid_1.v5)(hash_value, record_manager_js_1.UUIDV5_NAMESPACE);
    }
}
exports._HashedDocument = _HashedDocument;
function _batch(size, iterable) {
    const batches = [];
    let currentBatch = [];
    iterable.forEach((item) => {
        currentBatch.push(item);
        if (currentBatch.length >= size) {
            batches.push(currentBatch);
            currentBatch = [];
        }
    });
    if (currentBatch.length > 0) {
        batches.push(currentBatch);
    }
    return batches;
}
exports._batch = _batch;
function _deduplicateInOrder(hashedDocuments) {
    const seen = new Set();
    const deduplicated = [];
    for (const hashedDoc of hashedDocuments) {
        if (!hashedDoc.hash_) {
            throw new Error("Hashed document does not have a hash");
        }
        if (!seen.has(hashedDoc.hash_)) {
            seen.add(hashedDoc.hash_);
            deduplicated.push(hashedDoc);
        }
    }
    return deduplicated;
}
exports._deduplicateInOrder = _deduplicateInOrder;
function _getSourceIdAssigner(sourceIdKey) {
    if (sourceIdKey === null) {
        return (_doc) => null;
    }
    else if (typeof sourceIdKey === "string") {
        return (doc) => doc.metadata[sourceIdKey];
    }
    else if (typeof sourceIdKey === "function") {
        return sourceIdKey;
    }
    else {
        throw new Error(`sourceIdKey should be null, a string or a function, got ${typeof sourceIdKey}`);
    }
}
exports._getSourceIdAssigner = _getSourceIdAssigner;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const _isBaseDocumentLoader = (arg) => {
    if ("load" in arg &&
        typeof arg.load === "function" &&
        "loadAndSplit" in arg &&
        typeof arg.loadAndSplit === "function") {
        return true;
    }
    return false;
};
exports._isBaseDocumentLoader = _isBaseDocumentLoader;
/**
 * Index data from the doc source into the vector store.
 *
 * Indexing functionality uses a manager to keep track of which documents
 * are in the vector store.
 *
 * This allows us to keep track of which documents were updated, and which
 * documents were deleted, which documents should be skipped.
 *
 * For the time being, documents are indexed using their hashes, and users
 *  are not able to specify the uid of the document.
 *
 * @param {IndexArgs} args
 * @param {BaseDocumentLoader | DocumentInterface[]} args.docsSource The source of documents to index. Can be a DocumentLoader or a list of Documents.
 * @param {RecordManagerInterface} args.recordManager The record manager to use for keeping track of indexed documents.
 * @param {VectorStore} args.vectorStore The vector store to use for storing the documents.
 * @param {IndexOptions | undefined} args.options Options for indexing.
 * @returns {Promise<IndexingResult>}
 */
async function index(args) {
    const { docsSource, recordManager, vectorStore, options } = args;
    const { batchSize = 100, cleanup, sourceIdKey, cleanupBatchSize = 1000, forceUpdate = false, } = options ?? {};
    if (cleanup === "incremental" && !sourceIdKey) {
        throw new Error("sourceIdKey is required when cleanup mode is incremental. Please provide through 'options.sourceIdKey'.");
    }
    const docs = (0, exports._isBaseDocumentLoader)(docsSource)
        ? await docsSource.load()
        : docsSource;
    const sourceIdAssigner = _getSourceIdAssigner(sourceIdKey ?? null);
    const indexStartDt = await recordManager.getTime();
    let numAdded = 0;
    let numDeleted = 0;
    let numUpdated = 0;
    let numSkipped = 0;
    const batches = _batch(batchSize ?? 100, docs);
    for (const batch of batches) {
        const hashedDocs = _deduplicateInOrder(batch.map((doc) => _HashedDocument.fromDocument(doc)));
        const sourceIds = hashedDocs.map((doc) => sourceIdAssigner(doc));
        if (cleanup === "incremental") {
            hashedDocs.forEach((_hashedDoc, index) => {
                const source = sourceIds[index];
                if (source === null) {
                    throw new Error("sourceIdKey must be provided when cleanup is incremental");
                }
            });
        }
        const batchExists = await recordManager.exists(hashedDocs.map((doc) => doc.uid));
        const uids = [];
        const docsToIndex = [];
        const docsToUpdate = [];
        const seenDocs = new Set();
        hashedDocs.forEach((hashedDoc, i) => {
            const docExists = batchExists[i];
            if (docExists) {
                if (forceUpdate) {
                    seenDocs.add(hashedDoc.uid);
                }
                else {
                    docsToUpdate.push(hashedDoc.uid);
                    return;
                }
            }
            uids.push(hashedDoc.uid);
            docsToIndex.push(hashedDoc.toDocument());
        });
        if (docsToUpdate.length > 0) {
            await recordManager.update(docsToUpdate, { timeAtLeast: indexStartDt });
            numSkipped += docsToUpdate.length;
        }
        if (docsToIndex.length > 0) {
            await vectorStore.addDocuments(docsToIndex, { ids: uids });
            numAdded += docsToIndex.length - seenDocs.size;
            numUpdated += seenDocs.size;
        }
        await recordManager.update(hashedDocs.map((doc) => doc.uid), { timeAtLeast: indexStartDt, groupIds: sourceIds });
        if (cleanup === "incremental") {
            sourceIds.forEach((sourceId) => {
                if (!sourceId)
                    throw new Error("Source id cannot be null");
            });
            const uidsToDelete = await recordManager.listKeys({
                before: indexStartDt,
                groupIds: sourceIds,
            });
            if (uidsToDelete.length > 0) {
                await vectorStore.delete({ ids: uidsToDelete });
                await recordManager.deleteKeys(uidsToDelete);
                numDeleted += uidsToDelete.length;
            }
        }
    }
    if (cleanup === "full") {
        let uidsToDelete = await recordManager.listKeys({
            before: indexStartDt,
            limit: cleanupBatchSize,
        });
        while (uidsToDelete.length > 0) {
            await vectorStore.delete({ ids: uidsToDelete });
            await recordManager.deleteKeys(uidsToDelete);
            numDeleted += uidsToDelete.length;
            uidsToDelete = await recordManager.listKeys({
                before: indexStartDt,
                limit: cleanupBatchSize,
            });
        }
    }
    return {
        numAdded,
        numDeleted,
        numUpdated,
        numSkipped,
    };
}
exports.index = index;