"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.index = exports._isBaseDocumentLoader = exports._getSourceIdAssigner = exports._deduplicateInOrder = exports._batch = exports._HashedDocument = void 0; const uuid_1 = require("uuid"); const record_manager_js_1 = require("./record_manager.cjs"); const hash_js_1 = require("../utils/hash.cjs"); const document_js_1 = require("../documents/document.cjs"); /** * HashedDocument is a Document with hashes calculated. * Hashes are calculated based on page content and metadata. * It is used for indexing. */ class _HashedDocument { constructor(fields) { Object.defineProperty(this, "uid", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "hash_", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "contentHash", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "metadataHash", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "pageContent", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "metadata", { enumerable: true, configurable: true, writable: true, value: void 0 }); this.uid = fields.uid; this.pageContent = fields.pageContent; this.metadata = fields.metadata; } calculateHashes() { const forbiddenKeys = ["hash_", "content_hash", "metadata_hash"]; for (const key of forbiddenKeys) { if (key in this.metadata) { throw new Error(`Metadata cannot contain key ${key} as it is reserved for internal use. Restricted keys: [${forbiddenKeys.join(", ")}]`); } } const contentHash = this._hashStringToUUID(this.pageContent); try { const metadataHash = this._hashNestedDictToUUID(this.metadata); this.contentHash = contentHash; this.metadataHash = metadataHash; } catch (e) { throw new Error(`Failed to hash metadata: ${e}. Please use a dict that can be serialized using json.`); } this.hash_ = this._hashStringToUUID(this.contentHash + this.metadataHash); if (!this.uid) { this.uid = this.hash_; } } toDocument() { return new document_js_1.Document({ pageContent: this.pageContent, metadata: this.metadata, }); } static fromDocument(document, uid) { const doc = new this({ pageContent: document.pageContent, metadata: document.metadata, uid: uid || document.uid, }); doc.calculateHashes(); return doc; } _hashStringToUUID(inputString) { const hash_value = (0, hash_js_1.insecureHash)(inputString); return (0, uuid_1.v5)(hash_value, record_manager_js_1.UUIDV5_NAMESPACE); } _hashNestedDictToUUID(data) { const serialized_data = JSON.stringify(data, Object.keys(data).sort()); const hash_value = (0, hash_js_1.insecureHash)(serialized_data); return (0, uuid_1.v5)(hash_value, record_manager_js_1.UUIDV5_NAMESPACE); } } exports._HashedDocument = _HashedDocument; function _batch(size, iterable) { const batches = []; let currentBatch = []; iterable.forEach((item) => { currentBatch.push(item); if (currentBatch.length >= size) { batches.push(currentBatch); currentBatch = []; } }); if (currentBatch.length > 0) { batches.push(currentBatch); } return batches; } exports._batch = _batch; function _deduplicateInOrder(hashedDocuments) { const seen = new Set(); const deduplicated = []; for (const hashedDoc of hashedDocuments) { if (!hashedDoc.hash_) { throw new Error("Hashed document does not have a hash"); } if (!seen.has(hashedDoc.hash_)) { seen.add(hashedDoc.hash_); deduplicated.push(hashedDoc); } } return deduplicated; } exports._deduplicateInOrder = _deduplicateInOrder; function _getSourceIdAssigner(sourceIdKey) { if (sourceIdKey === null) { return (_doc) => null; } else if (typeof sourceIdKey === "string") { return (doc) => doc.metadata[sourceIdKey]; } else if (typeof sourceIdKey === "function") { return sourceIdKey; } else { throw new Error(`sourceIdKey should be null, a string or a function, got ${typeof sourceIdKey}`); } } exports._getSourceIdAssigner = _getSourceIdAssigner; // eslint-disable-next-line @typescript-eslint/no-explicit-any const _isBaseDocumentLoader = (arg) => { if ("load" in arg && typeof arg.load === "function" && "loadAndSplit" in arg && typeof arg.loadAndSplit === "function") { return true; } return false; }; exports._isBaseDocumentLoader = _isBaseDocumentLoader; /** * Index data from the doc source into the vector store. * * Indexing functionality uses a manager to keep track of which documents * are in the vector store. * * This allows us to keep track of which documents were updated, and which * documents were deleted, which documents should be skipped. * * For the time being, documents are indexed using their hashes, and users * are not able to specify the uid of the document. * * @param {IndexArgs} args * @param {BaseDocumentLoader | DocumentInterface[]} args.docsSource The source of documents to index. Can be a DocumentLoader or a list of Documents. * @param {RecordManagerInterface} args.recordManager The record manager to use for keeping track of indexed documents. * @param {VectorStore} args.vectorStore The vector store to use for storing the documents. * @param {IndexOptions | undefined} args.options Options for indexing. * @returns {Promise} */ async function index(args) { const { docsSource, recordManager, vectorStore, options } = args; const { batchSize = 100, cleanup, sourceIdKey, cleanupBatchSize = 1000, forceUpdate = false, } = options ?? {}; if (cleanup === "incremental" && !sourceIdKey) { throw new Error("sourceIdKey is required when cleanup mode is incremental. Please provide through 'options.sourceIdKey'."); } const docs = (0, exports._isBaseDocumentLoader)(docsSource) ? await docsSource.load() : docsSource; const sourceIdAssigner = _getSourceIdAssigner(sourceIdKey ?? null); const indexStartDt = await recordManager.getTime(); let numAdded = 0; let numDeleted = 0; let numUpdated = 0; let numSkipped = 0; const batches = _batch(batchSize ?? 100, docs); for (const batch of batches) { const hashedDocs = _deduplicateInOrder(batch.map((doc) => _HashedDocument.fromDocument(doc))); const sourceIds = hashedDocs.map((doc) => sourceIdAssigner(doc)); if (cleanup === "incremental") { hashedDocs.forEach((_hashedDoc, index) => { const source = sourceIds[index]; if (source === null) { throw new Error("sourceIdKey must be provided when cleanup is incremental"); } }); } const batchExists = await recordManager.exists(hashedDocs.map((doc) => doc.uid)); const uids = []; const docsToIndex = []; const docsToUpdate = []; const seenDocs = new Set(); hashedDocs.forEach((hashedDoc, i) => { const docExists = batchExists[i]; if (docExists) { if (forceUpdate) { seenDocs.add(hashedDoc.uid); } else { docsToUpdate.push(hashedDoc.uid); return; } } uids.push(hashedDoc.uid); docsToIndex.push(hashedDoc.toDocument()); }); if (docsToUpdate.length > 0) { await recordManager.update(docsToUpdate, { timeAtLeast: indexStartDt }); numSkipped += docsToUpdate.length; } if (docsToIndex.length > 0) { await vectorStore.addDocuments(docsToIndex, { ids: uids }); numAdded += docsToIndex.length - seenDocs.size; numUpdated += seenDocs.size; } await recordManager.update(hashedDocs.map((doc) => doc.uid), { timeAtLeast: indexStartDt, groupIds: sourceIds }); if (cleanup === "incremental") { sourceIds.forEach((sourceId) => { if (!sourceId) throw new Error("Source id cannot be null"); }); const uidsToDelete = await recordManager.listKeys({ before: indexStartDt, groupIds: sourceIds, }); if (uidsToDelete.length > 0) { await vectorStore.delete({ ids: uidsToDelete }); await recordManager.deleteKeys(uidsToDelete); numDeleted += uidsToDelete.length; } } } if (cleanup === "full") { let uidsToDelete = await recordManager.listKeys({ before: indexStartDt, limit: cleanupBatchSize, }); while (uidsToDelete.length > 0) { await vectorStore.delete({ ids: uidsToDelete }); await recordManager.deleteKeys(uidsToDelete); numDeleted += uidsToDelete.length; uidsToDelete = await recordManager.listKeys({ before: indexStartDt, limit: cleanupBatchSize, }); } } return { numAdded, numDeleted, numUpdated, numSkipped, }; } exports.index = index;