base.cjs 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272
  1. "use strict";
  2. Object.defineProperty(exports, "__esModule", { value: true });
  3. exports.index = exports._isBaseDocumentLoader = exports._getSourceIdAssigner = exports._deduplicateInOrder = exports._batch = exports._HashedDocument = void 0;
  4. const uuid_1 = require("uuid");
  5. const record_manager_js_1 = require("./record_manager.cjs");
  6. const hash_js_1 = require("../utils/hash.cjs");
  7. const document_js_1 = require("../documents/document.cjs");
  8. /**
  9. * HashedDocument is a Document with hashes calculated.
  10. * Hashes are calculated based on page content and metadata.
  11. * It is used for indexing.
  12. */
  13. class _HashedDocument {
  14. constructor(fields) {
  15. Object.defineProperty(this, "uid", {
  16. enumerable: true,
  17. configurable: true,
  18. writable: true,
  19. value: void 0
  20. });
  21. Object.defineProperty(this, "hash_", {
  22. enumerable: true,
  23. configurable: true,
  24. writable: true,
  25. value: void 0
  26. });
  27. Object.defineProperty(this, "contentHash", {
  28. enumerable: true,
  29. configurable: true,
  30. writable: true,
  31. value: void 0
  32. });
  33. Object.defineProperty(this, "metadataHash", {
  34. enumerable: true,
  35. configurable: true,
  36. writable: true,
  37. value: void 0
  38. });
  39. Object.defineProperty(this, "pageContent", {
  40. enumerable: true,
  41. configurable: true,
  42. writable: true,
  43. value: void 0
  44. });
  45. Object.defineProperty(this, "metadata", {
  46. enumerable: true,
  47. configurable: true,
  48. writable: true,
  49. value: void 0
  50. });
  51. this.uid = fields.uid;
  52. this.pageContent = fields.pageContent;
  53. this.metadata = fields.metadata;
  54. }
  55. calculateHashes() {
  56. const forbiddenKeys = ["hash_", "content_hash", "metadata_hash"];
  57. for (const key of forbiddenKeys) {
  58. if (key in this.metadata) {
  59. throw new Error(`Metadata cannot contain key ${key} as it is reserved for internal use. Restricted keys: [${forbiddenKeys.join(", ")}]`);
  60. }
  61. }
  62. const contentHash = this._hashStringToUUID(this.pageContent);
  63. try {
  64. const metadataHash = this._hashNestedDictToUUID(this.metadata);
  65. this.contentHash = contentHash;
  66. this.metadataHash = metadataHash;
  67. }
  68. catch (e) {
  69. throw new Error(`Failed to hash metadata: ${e}. Please use a dict that can be serialized using json.`);
  70. }
  71. this.hash_ = this._hashStringToUUID(this.contentHash + this.metadataHash);
  72. if (!this.uid) {
  73. this.uid = this.hash_;
  74. }
  75. }
  76. toDocument() {
  77. return new document_js_1.Document({
  78. pageContent: this.pageContent,
  79. metadata: this.metadata,
  80. });
  81. }
  82. static fromDocument(document, uid) {
  83. const doc = new this({
  84. pageContent: document.pageContent,
  85. metadata: document.metadata,
  86. uid: uid || document.uid,
  87. });
  88. doc.calculateHashes();
  89. return doc;
  90. }
  91. _hashStringToUUID(inputString) {
  92. const hash_value = (0, hash_js_1.insecureHash)(inputString);
  93. return (0, uuid_1.v5)(hash_value, record_manager_js_1.UUIDV5_NAMESPACE);
  94. }
  95. _hashNestedDictToUUID(data) {
  96. const serialized_data = JSON.stringify(data, Object.keys(data).sort());
  97. const hash_value = (0, hash_js_1.insecureHash)(serialized_data);
  98. return (0, uuid_1.v5)(hash_value, record_manager_js_1.UUIDV5_NAMESPACE);
  99. }
  100. }
  101. exports._HashedDocument = _HashedDocument;
  102. function _batch(size, iterable) {
  103. const batches = [];
  104. let currentBatch = [];
  105. iterable.forEach((item) => {
  106. currentBatch.push(item);
  107. if (currentBatch.length >= size) {
  108. batches.push(currentBatch);
  109. currentBatch = [];
  110. }
  111. });
  112. if (currentBatch.length > 0) {
  113. batches.push(currentBatch);
  114. }
  115. return batches;
  116. }
  117. exports._batch = _batch;
  118. function _deduplicateInOrder(hashedDocuments) {
  119. const seen = new Set();
  120. const deduplicated = [];
  121. for (const hashedDoc of hashedDocuments) {
  122. if (!hashedDoc.hash_) {
  123. throw new Error("Hashed document does not have a hash");
  124. }
  125. if (!seen.has(hashedDoc.hash_)) {
  126. seen.add(hashedDoc.hash_);
  127. deduplicated.push(hashedDoc);
  128. }
  129. }
  130. return deduplicated;
  131. }
  132. exports._deduplicateInOrder = _deduplicateInOrder;
  133. function _getSourceIdAssigner(sourceIdKey) {
  134. if (sourceIdKey === null) {
  135. return (_doc) => null;
  136. }
  137. else if (typeof sourceIdKey === "string") {
  138. return (doc) => doc.metadata[sourceIdKey];
  139. }
  140. else if (typeof sourceIdKey === "function") {
  141. return sourceIdKey;
  142. }
  143. else {
  144. throw new Error(`sourceIdKey should be null, a string or a function, got ${typeof sourceIdKey}`);
  145. }
  146. }
  147. exports._getSourceIdAssigner = _getSourceIdAssigner;
  148. // eslint-disable-next-line @typescript-eslint/no-explicit-any
  149. const _isBaseDocumentLoader = (arg) => {
  150. if ("load" in arg &&
  151. typeof arg.load === "function" &&
  152. "loadAndSplit" in arg &&
  153. typeof arg.loadAndSplit === "function") {
  154. return true;
  155. }
  156. return false;
  157. };
  158. exports._isBaseDocumentLoader = _isBaseDocumentLoader;
/**
 * Index data from the doc source into the vector store.
 *
 * Indexing functionality uses a manager to keep track of which documents
 * are in the vector store.
 *
 * This allows us to keep track of which documents were updated, and which
 * documents were deleted, which documents should be skipped.
 *
 * For the time being, documents are indexed using their hashes, and users
 * are not able to specify the uid of the document.
 *
 * @param {IndexArgs} args
 * @param {BaseDocumentLoader | DocumentInterface[]} args.docsSource The source of documents to index. Can be a DocumentLoader or a list of Documents.
 * @param {RecordManagerInterface} args.recordManager The record manager to use for keeping track of indexed documents.
 * @param {VectorStore} args.vectorStore The vector store to use for storing the documents.
 * @param {IndexOptions | undefined} args.options Options for indexing.
 * @returns {Promise<IndexingResult>} counts of added/deleted/updated/skipped documents.
 * @throws {Error} when cleanup is "incremental" but no sourceIdKey is given,
 *   or when any document's source id resolves to null in incremental mode.
 */
async function index(args) {
    const { docsSource, recordManager, vectorStore, options } = args;
    const { batchSize = 100, cleanup, sourceIdKey, cleanupBatchSize = 1000, forceUpdate = false, } = options ?? {};
    // Incremental cleanup scopes deletions by source id, so the key is mandatory.
    if (cleanup === "incremental" && !sourceIdKey) {
        throw new Error("sourceIdKey is required when cleanup mode is incremental. Please provide through 'options.sourceIdKey'.");
    }
    // Accept either a loader (call .load()) or a pre-loaded document array.
    const docs = (0, exports._isBaseDocumentLoader)(docsSource)
        ? await docsSource.load()
        : docsSource;
    const sourceIdAssigner = _getSourceIdAssigner(sourceIdKey ?? null);
    // Timestamp taken before any writes: records touched during this run get
    // a time >= indexStartDt, so `before: indexStartDt` later selects exactly
    // the stale (untouched) records.
    const indexStartDt = await recordManager.getTime();
    let numAdded = 0;
    let numDeleted = 0;
    let numUpdated = 0;
    let numSkipped = 0;
    const batches = _batch(batchSize ?? 100, docs);
    for (const batch of batches) {
        // Hash every document; exact duplicates within a batch collapse to one.
        const hashedDocs = _deduplicateInOrder(batch.map((doc) => _HashedDocument.fromDocument(doc)));
        const sourceIds = hashedDocs.map((doc) => sourceIdAssigner(doc));
        // Fail fast if any document lacks a source id in incremental mode.
        // NOTE: the callback parameter `index` shadows this function's name.
        if (cleanup === "incremental") {
            hashedDocs.forEach((_hashedDoc, index) => {
                const source = sourceIds[index];
                if (source === null) {
                    throw new Error("sourceIdKey must be provided when cleanup is incremental");
                }
            });
        }
        const batchExists = await recordManager.exists(hashedDocs.map((doc) => doc.uid));
        const uids = [];
        const docsToIndex = [];
        // Despite the name, these uids are *not* re-embedded; they only get
        // their record-manager timestamp refreshed and count as "skipped".
        const docsToUpdate = [];
        // Uids re-indexed because of forceUpdate; counted as "updated" below.
        const seenDocs = new Set();
        hashedDocs.forEach((hashedDoc, i) => {
            const docExists = batchExists[i];
            if (docExists) {
                if (forceUpdate) {
                    seenDocs.add(hashedDoc.uid);
                }
                else {
                    // Unchanged and not forced: just bump its timestamp, skip embedding.
                    docsToUpdate.push(hashedDoc.uid);
                    return;
                }
            }
            uids.push(hashedDoc.uid);
            docsToIndex.push(hashedDoc.toDocument());
        });
        // Refresh timestamps for skipped docs so full/incremental cleanup
        // does not treat them as stale.
        if (docsToUpdate.length > 0) {
            await recordManager.update(docsToUpdate, { timeAtLeast: indexStartDt });
            numSkipped += docsToUpdate.length;
        }
        if (docsToIndex.length > 0) {
            await vectorStore.addDocuments(docsToIndex, { ids: uids });
            // Force-updated docs are counted as updates, the rest as additions.
            numAdded += docsToIndex.length - seenDocs.size;
            numUpdated += seenDocs.size;
        }
        // Record every uid in this batch (with its group/source id) as current.
        await recordManager.update(hashedDocs.map((doc) => doc.uid), { timeAtLeast: indexStartDt, groupIds: sourceIds });
        if (cleanup === "incremental") {
            // Defensive re-check; the earlier loop should already have thrown.
            sourceIds.forEach((sourceId) => {
                if (!sourceId)
                    throw new Error("Source id cannot be null");
            });
            // Delete records from the same sources that were not touched this run
            // (i.e. documents that disappeared from those sources).
            const uidsToDelete = await recordManager.listKeys({
                before: indexStartDt,
                groupIds: sourceIds,
            });
            if (uidsToDelete.length > 0) {
                await vectorStore.delete({ ids: uidsToDelete });
                await recordManager.deleteKeys(uidsToDelete);
                numDeleted += uidsToDelete.length;
            }
        }
    }
    // Full cleanup: purge, in batches, every record not touched during this run.
    if (cleanup === "full") {
        let uidsToDelete = await recordManager.listKeys({
            before: indexStartDt,
            limit: cleanupBatchSize,
        });
        while (uidsToDelete.length > 0) {
            await vectorStore.delete({ ids: uidsToDelete });
            await recordManager.deleteKeys(uidsToDelete);
            numDeleted += uidsToDelete.length;
            uidsToDelete = await recordManager.listKeys({
                before: indexStartDt,
                limit: cleanupBatchSize,
            });
        }
    }
    return {
        numAdded,
        numDeleted,
        numUpdated,
        numSkipped,
    };
}
exports.index = index;