base.d.ts 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114
  1. import { VectorStore } from "../vectorstores.js";
  2. import { RecordManagerInterface } from "./record_manager.js";
  3. import { DocumentInterface } from "../documents/document.js";
  4. import { BaseDocumentLoader } from "../document_loaders/base.js";
  /** String-keyed bag of values of unknown type; this is the type of the `metadata` field on hashed documents. */
  5. type Metadata = Record<string, unknown>;
  /**
   * Summary counters that the promise returned by `index` resolves to.
   * Going by the field names, they tally how many documents were added,
   * deleted, updated and skipped in one run (exact counting rules live in
   * the implementation, which is not visible in this declaration file).
   */
  6. type IndexingResult = {
  7. numAdded: number;
  8. numDeleted: number;
  9. numUpdated: number;
  10. numSkipped: number;
  11. };
  /**
   * Either a plain string or a function that maps a document to a string.
   * Used as the type of `IndexOptions.sourceIdKey` and as the input of
   * `_getSourceIdAssigner`.
   */
  12. type StringOrDocFunc = string | ((doc: DocumentInterface) => string);
  /**
   * Shape of a document that carries an identifier and hash fields in
   * addition to everything declared by `DocumentInterface`.
   *
   * `uid` is required; the three hash fields are optional strings.
   * NOTE(review): presumably `calculateHashes()` is what fills the hash
   * fields — confirm against the implementation.
   */
  13. export interface HashedDocumentInterface extends DocumentInterface {
  14. uid: string;
  15. hash_?: string;
  16. contentHash?: string;
  17. metadataHash?: string;
  18. pageContent: string;
  19. metadata: Metadata;
  /** Takes no arguments and returns nothing. */
  20. calculateHashes(): void;
  /** Returns a `DocumentInterface` view of this hashed document. */
  21. toDocument(): DocumentInterface;
  22. }
  /**
   * Argument object accepted by the `_HashedDocument` constructor.
   * All three fields are required.
   */
  23. interface HashedDocumentArgs {
  24. pageContent: string;
  25. metadata: Metadata;
  26. uid: string;
  27. }
  28. /**
  29. * HashedDocument is a Document with hashes calculated.
  30. * Hashes are calculated based on page content and metadata.
  31. * It is used for indexing.
  *
  * Declared as implementing `HashedDocumentInterface`; instances are built
  * either through the constructor or through the static `fromDocument`.
  32. */
  33. export declare class _HashedDocument implements HashedDocumentInterface {
  34. uid: string;
  35. hash_?: string;
  36. contentHash?: string;
  37. metadataHash?: string;
  38. pageContent: string;
  39. metadata: Metadata;
  /** @param fields Page content, metadata and uid for the new instance. */
  40. constructor(fields: HashedDocumentArgs);
  /** Takes no arguments and returns nothing. NOTE(review): presumably populates the hash fields — confirm. */
  41. calculateHashes(): void;
  /** Returns a `DocumentInterface` for this hashed document. */
  42. toDocument(): DocumentInterface;
  /**
   * Static factory that builds a `_HashedDocument` from a `DocumentInterface`.
   * `uid` is optional; what is used when it is omitted is not visible here.
   */
  43. static fromDocument(document: DocumentInterface, uid?: string): _HashedDocument;
  /* Private members: this declaration carries no signatures for them. */
  44. private _hashStringToUUID;
  45. private _hashNestedDictToUUID;
  46. }
  /** The two cleanup strategies accepted by `IndexOptions.cleanup`; each is described on that option. */
  47. export type CleanupMode = "full" | "incremental";
  /** Optional settings for `index`; every field may be omitted. */
  48. export type IndexOptions = {
  49. /**
  50. * The number of documents to index in one batch.
  51. */
  52. batchSize?: number;
  53. /**
  54. * The cleanup mode to use. Can be "full", "incremental" or undefined.
  55. * - **Incremental**: Cleans up all documents that haven't been updated AND
  56. * that are associated with source ids that were seen
  57. * during indexing.
  58. * Clean up is done continuously during indexing helping
  59. * to minimize the probability of users seeing duplicated
  60. * content.
  61. * - **Full**: Delete all documents that have not been returned by the loader.
  62. * Clean up runs after all documents have been indexed.
  63. * This means that users may see duplicated content during indexing.
  64. * - **undefined**: Do not delete any documents.
  65. */
  66. cleanup?: CleanupMode;
  67. /**
  68. * Optional key that helps identify the original source of the document.
  69. * Must either be a string representing the key of the source in the metadata
  70. * or a function that takes a document and returns a string representing the source.
  71. * **Required when cleanup is incremental**.
  72. */
  73. sourceIdKey?: StringOrDocFunc;
  74. /**
  75. * Batch size to use when cleaning up documents.
  76. */
  77. cleanupBatchSize?: number;
  78. /**
  79. * Force update documents even if they are present in the
  80. * record manager. Useful if you are re-indexing with updated embeddings.
  81. */
  82. forceUpdate?: boolean;
  83. };
  /**
   * Takes a size and an array and returns an array of arrays with the same
   * element type.
   * NOTE(review): presumably splits `iterable` into consecutive chunks of at
   * most `size` items — confirm against the implementation.
   */
  84. export declare function _batch<T>(size: number, iterable: T[]): T[][];
  /**
   * Takes a list of hashed documents and returns a list of hashed documents.
   * NOTE(review): going by the name, duplicates are dropped while the original
   * order is kept; which field decides equality is not visible here — confirm.
   */
  85. export declare function _deduplicateInOrder(hashedDocuments: HashedDocumentInterface[]): HashedDocumentInterface[];
  /**
   * Given a source-id key (a string, a document-to-string function, or null),
   * returns a function that maps a document to a string or null.
   * NOTE(review): presumably the returned function yields the document's
   * source id, and null when no key was supplied — confirm.
   */
  86. export declare function _getSourceIdAssigner(sourceIdKey: StringOrDocFunc | null): (doc: DocumentInterface) => string | null;
  /**
   * Type guard: when it returns true, `arg` is narrowed to `BaseDocumentLoader`.
   * How the check is performed is not visible in this declaration.
   * NOTE(review): the parameter is typed `any`; `unknown` would accept the same
   * callers without disabling checks — consider changing it in the source file.
   */
  87. export declare const _isBaseDocumentLoader: (arg: any) => arg is BaseDocumentLoader;
  /**
   * Argument object accepted by `index`.
   * `docsSource` may be a document loader or an array of documents;
   * `options` is the only optional field.
   */
  88. interface IndexArgs {
  89. docsSource: BaseDocumentLoader | DocumentInterface[];
  90. recordManager: RecordManagerInterface;
  91. vectorStore: VectorStore;
  92. options?: IndexOptions;
  93. }
  94. /**
  95. * Index data from the doc source into the vector store.
  96. *
  97. * Indexing functionality uses a manager to keep track of which documents
  98. * are in the vector store.
  99. *
  100. * This allows us to keep track of which documents were updated, which
  101. * documents were deleted, and which documents should be skipped.
  102. *
  103. * For the time being, documents are indexed using their hashes, and users
  104. * are not able to specify the uid of the document.
  105. *
  106. * @param {IndexArgs} args
  107. * @param {BaseDocumentLoader | DocumentInterface[]} args.docsSource The source of documents to index. Can be a DocumentLoader or a list of Documents.
  108. * @param {RecordManagerInterface} args.recordManager The record manager to use for keeping track of indexed documents.
  109. * @param {VectorStore} args.vectorStore The vector store to use for storing the documents.
  110. * @param {IndexOptions | undefined} args.options Options for indexing.
  111. * @returns {Promise<IndexingResult>} Resolves to the numAdded / numDeleted / numUpdated / numSkipped counters.
  112. */
  113. export declare function index(args: IndexArgs): Promise<IndexingResult>;
  114. export {};