base.js

import { v5 as uuidv5 } from "uuid";
import { UUIDV5_NAMESPACE } from "./record_manager.js";
import { insecureHash } from "../utils/hash.js";
import { Document } from "../documents/document.js";
/**
 * HashedDocument is a Document with hashes calculated.
 * Hashes are calculated based on page content and metadata.
 * It is used for indexing.
 */
export class _HashedDocument {
    constructor(fields) {
        this.uid = fields.uid;
        this.pageContent = fields.pageContent;
        this.metadata = fields.metadata;
    }
    calculateHashes() {
        const forbiddenKeys = ["hash_", "content_hash", "metadata_hash"];
        for (const key of forbiddenKeys) {
            if (key in this.metadata) {
                throw new Error(`Metadata cannot contain key ${key} as it is reserved for internal use. Restricted keys: [${forbiddenKeys.join(", ")}]`);
            }
        }
        const contentHash = this._hashStringToUUID(this.pageContent);
        try {
            const metadataHash = this._hashNestedDictToUUID(this.metadata);
            this.contentHash = contentHash;
            this.metadataHash = metadataHash;
        }
        catch (e) {
            throw new Error(`Failed to hash metadata: ${e}. Please use an object that can be serialized with JSON.stringify.`);
        }
        // The document hash combines the content and metadata hashes, so a
        // change to either one produces a new uid.
        this.hash_ = this._hashStringToUUID(this.contentHash + this.metadataHash);
        if (!this.uid) {
            this.uid = this.hash_;
        }
    }
    toDocument() {
        return new Document({
            pageContent: this.pageContent,
            metadata: this.metadata,
        });
    }
    static fromDocument(document, uid) {
        const doc = new this({
            pageContent: document.pageContent,
            metadata: document.metadata,
            uid: uid || document.uid,
        });
        doc.calculateHashes();
        return doc;
    }
    _hashStringToUUID(inputString) {
        const hash_value = insecureHash(inputString);
        return uuidv5(hash_value, UUIDV5_NAMESPACE);
    }
    _hashNestedDictToUUID(data) {
        // Passing the sorted key list as a JSON.stringify replacer makes the
        // serialization (and therefore the hash) insensitive to metadata
        // insertion order. Note that an array replacer also restricts
        // serialization to the listed keys.
        const serialized_data = JSON.stringify(data, Object.keys(data).sort());
        const hash_value = insecureHash(serialized_data);
        return uuidv5(hash_value, UUIDV5_NAMESPACE);
    }
}
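// Usage sketch (illustrative comment only, not executed by this module): two
// documents with identical content and metadata hash to the same deterministic
// uid, which is what lets the indexer recognize unchanged documents across runs.
//
//   const a = _HashedDocument.fromDocument(
//     new Document({ pageContent: "hello", metadata: { source: "a.txt" } })
//   );
//   const b = _HashedDocument.fromDocument(
//     new Document({ pageContent: "hello", metadata: { source: "a.txt" } })
//   );
//   // a.uid === b.uid; both are UUIDv5 values derived from the same
//   // content and metadata hashes.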
export function _batch(size, iterable) {
    const batches = [];
    let currentBatch = [];
    iterable.forEach((item) => {
        currentBatch.push(item);
        if (currentBatch.length >= size) {
            batches.push(currentBatch);
            currentBatch = [];
        }
    });
    if (currentBatch.length > 0) {
        batches.push(currentBatch);
    }
    return batches;
}
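// Example: _batch(2, [1, 2, 3, 4, 5]) returns [[1, 2], [3, 4], [5]].
// Order is preserved, and the final batch may be smaller than `size`.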
export function _deduplicateInOrder(hashedDocuments) {
    const seen = new Set();
    const deduplicated = [];
    for (const hashedDoc of hashedDocuments) {
        if (!hashedDoc.hash_) {
            throw new Error("Hashed document does not have a hash");
        }
        if (!seen.has(hashedDoc.hash_)) {
            seen.add(hashedDoc.hash_);
            deduplicated.push(hashedDoc);
        }
    }
    return deduplicated;
}
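// Example: if three hashed documents carry hashes [h1, h2, h1], only the first
// h1 document survives, yielding [h1, h2] with the original order intact. This
// keeps exact duplicates within a batch from being embedded twice.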
export function _getSourceIdAssigner(sourceIdKey) {
    if (sourceIdKey === null) {
        return (_doc) => null;
    }
    else if (typeof sourceIdKey === "string") {
        return (doc) => doc.metadata[sourceIdKey];
    }
    else if (typeof sourceIdKey === "function") {
        return sourceIdKey;
    }
    else {
        throw new Error(`sourceIdKey should be null, a string or a function, got ${typeof sourceIdKey}`);
    }
}
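// Example (illustrative): with sourceIdKey = "source" the assigner reads
// doc.metadata.source, so a Document with metadata { source: "a.txt" } maps to
// "a.txt". A function such as (doc) => doc.metadata.url is used as-is, and
// null yields an assigner that always returns null.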
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export const _isBaseDocumentLoader = (arg) => {
    if ("load" in arg &&
        typeof arg.load === "function" &&
        "loadAndSplit" in arg &&
        typeof arg.loadAndSplit === "function") {
        return true;
    }
    return false;
};
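// Duck-typing check: anything exposing both load() and loadAndSplit() methods
// (e.g. a document loader instance) is treated as a loader, so index() below
// calls .load() on it; a plain array of Documents fails the check and is used
// directly.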
/**
 * Index data from the doc source into the vector store.
 *
 * Indexing functionality uses a manager to keep track of which documents
 * are in the vector store.
 *
 * This allows us to keep track of which documents were updated, which
 * documents were deleted, and which documents should be skipped.
 *
 * For the time being, documents are indexed using their hashes, and users
 * are not able to specify the uid of the document.
 *
 * @param {IndexArgs} args
 * @param {BaseDocumentLoader | DocumentInterface[]} args.docsSource The source of documents to index. Can be a DocumentLoader or a list of Documents.
 * @param {RecordManagerInterface} args.recordManager The record manager to use for keeping track of indexed documents.
 * @param {VectorStore} args.vectorStore The vector store to use for storing the documents.
 * @param {IndexOptions | undefined} args.options Options for indexing.
 * @returns {Promise<IndexingResult>}
 */
export async function index(args) {
    const { docsSource, recordManager, vectorStore, options } = args;
    const { batchSize = 100, cleanup, sourceIdKey, cleanupBatchSize = 1000, forceUpdate = false } = options ?? {};
    if (cleanup === "incremental" && !sourceIdKey) {
        throw new Error("sourceIdKey is required when cleanup mode is incremental. Please provide through 'options.sourceIdKey'.");
    }
    const docs = _isBaseDocumentLoader(docsSource)
        ? await docsSource.load()
        : docsSource;
    const sourceIdAssigner = _getSourceIdAssigner(sourceIdKey ?? null);
    // Timestamp taken before indexing begins. Records touched during this run
    // are stamped at or after this time, so anything older is a candidate for
    // cleanup.
    const indexStartDt = await recordManager.getTime();
    let numAdded = 0;
    let numDeleted = 0;
    let numUpdated = 0;
    let numSkipped = 0;
    const batches = _batch(batchSize ?? 100, docs);
    for (const batch of batches) {
        const hashedDocs = _deduplicateInOrder(batch.map((doc) => _HashedDocument.fromDocument(doc)));
        const sourceIds = hashedDocs.map((doc) => sourceIdAssigner(doc));
        if (cleanup === "incremental") {
            hashedDocs.forEach((_hashedDoc, idx) => {
                const source = sourceIds[idx];
                if (source === null) {
                    throw new Error("sourceIdKey must be provided when cleanup is incremental");
                }
            });
        }
        const batchExists = await recordManager.exists(hashedDocs.map((doc) => doc.uid));
        const uids = [];
        const docsToIndex = [];
        const docsToUpdate = [];
        const seenDocs = new Set();
        hashedDocs.forEach((hashedDoc, i) => {
            const docExists = batchExists[i];
            if (docExists) {
                if (forceUpdate) {
                    // Re-embed even though the document is unchanged.
                    seenDocs.add(hashedDoc.uid);
                }
                else {
                    // Unchanged document: only refresh its record timestamp.
                    docsToUpdate.push(hashedDoc.uid);
                    return;
                }
            }
            uids.push(hashedDoc.uid);
            docsToIndex.push(hashedDoc.toDocument());
        });
        if (docsToUpdate.length > 0) {
            await recordManager.update(docsToUpdate, { timeAtLeast: indexStartDt });
            numSkipped += docsToUpdate.length;
        }
        if (docsToIndex.length > 0) {
            await vectorStore.addDocuments(docsToIndex, { ids: uids });
            numAdded += docsToIndex.length - seenDocs.size;
            numUpdated += seenDocs.size;
        }
        await recordManager.update(hashedDocs.map((doc) => doc.uid), { timeAtLeast: indexStartDt, groupIds: sourceIds });
        if (cleanup === "incremental") {
            sourceIds.forEach((sourceId) => {
                if (!sourceId)
                    throw new Error("Source id cannot be null");
            });
            // Incremental cleanup: among the sources seen in this batch, delete
            // records that were not touched during this run.
            const uidsToDelete = await recordManager.listKeys({
                before: indexStartDt,
                groupIds: sourceIds,
            });
            if (uidsToDelete.length > 0) {
                await vectorStore.delete({ ids: uidsToDelete });
                await recordManager.deleteKeys(uidsToDelete);
                numDeleted += uidsToDelete.length;
            }
        }
    }
    if (cleanup === "full") {
        // Full cleanup: delete every record not touched during this run,
        // paging through stale keys in batches of cleanupBatchSize.
        let uidsToDelete = await recordManager.listKeys({
            before: indexStartDt,
            limit: cleanupBatchSize,
        });
        while (uidsToDelete.length > 0) {
            await vectorStore.delete({ ids: uidsToDelete });
            await recordManager.deleteKeys(uidsToDelete);
            numDeleted += uidsToDelete.length;
            uidsToDelete = await recordManager.listKeys({
                before: indexStartDt,
                limit: cleanupBatchSize,
            });
        }
    }
    return {
        numAdded,
        numDeleted,
        numUpdated,
        numSkipped,
    };
}
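// Usage sketch (assumes `vectorStore` and `recordManager` are instances of
// compatible VectorStore / RecordManagerInterface implementations; the names
// here are placeholders, not part of this module):
//
//   const result = await index({
//     docsSource: [
//       new Document({ pageContent: "hello", metadata: { source: "a.txt" } }),
//     ],
//     recordManager,
//     vectorStore,
//     options: { cleanup: "incremental", sourceIdKey: "source" },
//   });
//   // First run: { numAdded: 1, numDeleted: 0, numUpdated: 0, numSkipped: 0 }
//   // An identical second run skips the unchanged document instead of
//   // re-embedding it, and "incremental" cleanup deletes stale records that
//   // share a source with the documents in this run.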