from transformers import AutoModel, AutoTokenizer
import os
import torch
import faiss
import numpy as np
import pickle
import re
from sklearn.preprocessing import normalize

# Local model path and resource paths
model_path = r"D:/STUDY/Project/jizhouyao/RAG/models/all-MiniLM-L6-v2"  # locally stored pretrained model
resources_path = r"D:/STUDY/Project/jizhouyao/RAG/resources"  # folder containing the .txt source files
faiss_index_path = r"D:/STUDY/Project/jizhouyao/RAG/index/vector_index.faiss"  # where the FAISS index is saved
texts_pickle_path = r"D:/STUDY/Project/jizhouyao/RAG/index/texts.pkl"  # where the text data is saved

# Verify that the paths exist
if not os.path.exists(model_path):
    raise FileNotFoundError(f"Model path does not exist: {model_path}")
if not os.path.exists(resources_path):
    raise FileNotFoundError(f"Resources folder does not exist: {resources_path}")

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = AutoModel.from_pretrained(model_path, local_files_only=True)

# Text storage
all_texts = []  # holds the content of every source file

# Sentence splitter: split Chinese text on punctuation using a regular expression
def split_sentences(text):
    sentences = re.split(r'[。!?;::\n]', text)  # split on sentence-ending punctuation and newlines
    return [sentence.strip() for sentence in sentences if sentence.strip()]

# Vectorize the text and choose the index type dynamically
def vectorize_and_store(texts):
    """
    Split each text into sentences, embed them, and store them in a FAISS index (sentence level).
    """
    print("Starting sentence vectorization...")
    embeddings = []  # embedding vectors
    global all_sentences  # all sentences plus their source metadata
    all_sentences = []  # reset storage

    # Iterate over all text files
    for idx, text_data in enumerate(texts):
        file_name = text_data["name"]   # file name
        content = text_data["content"]  # file content

        # Split into sentences
        sentences = split_sentences(content)
        for sentence in sentences:
            if not sentence.strip():  # skip empty sentences
                continue
            # Embed each sentence
            inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
            # L2-normalize the sentence vector
            cls_embedding = normalize(cls_embedding.reshape(1, -1)).flatten()
            embeddings.append(cls_embedding)
            # Store the sentence together with its source metadata
            all_sentences.append({
                "file": file_name,
                "sentence": sentence
            })

    # Convert all vectors to a numpy array (FAISS expects float32)
    embeddings = np.array(embeddings).astype("float32")

    # Choose the FAISS index type based on the data size
    if len(embeddings) < 10:
        print("Small dataset: using IndexFlatL2...")
        index = faiss.IndexFlatL2(embeddings.shape[1])  # flat index, no training required
    else:
        num_clusters = max(1, min(10, len(embeddings) // 10))  # number of clusters scales with the data size
        print(f"Larger dataset: using an IVFFlat index with {num_clusters} clusters")
        quantizer = faiss.IndexFlatL2(embeddings.shape[1])
        index = faiss.IndexIVFFlat(quantizer, embeddings.shape[1], num_clusters, faiss.METRIC_L2)
        print("Training the IVFFlat index...")
        index.train(embeddings)  # train the index

    # Add the vectors to the index
    index.add(embeddings)
    print(f"Vectorized and indexed {len(embeddings)} sentences!")
    return index

# Save the index and the text data
def save_index_and_texts(index):
    os.makedirs(os.path.dirname(faiss_index_path), exist_ok=True)
    faiss.write_index(index, faiss_index_path)
    # Persist both the raw documents and the per-sentence metadata;
    # FAISS row i corresponds to all_sentences[i], which is needed to map search hits back to text.
    with open(texts_pickle_path, 'wb') as f:
        pickle.dump({"texts": all_texts, "sentences": all_sentences}, f)
    print("FAISS index and text data saved!")

# Read the resource files
def process_txt_files(resources_path):
    for filename in os.listdir(resources_path):
        if filename.endswith('.txt'):
            file_path = os.path.join(resources_path, filename)
            print(f"Processing file: {file_path}")
            with open(file_path, 'r', encoding='utf-8') as file:
                all_texts.append({"name": filename, "content": file.read()})

# Main entry point
if __name__ == '__main__':
    process_txt_files(resources_path)       # read the text data
    index = vectorize_and_store(all_texts)  # vectorize and build the index
    save_index_and_texts(index)             # save the index and text data
    print("Index build complete!")
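As a companion, here is a minimal sketch of how the saved artifacts could be queried. It is an illustration, not part of the original script: it reuses `model_path`, `faiss_index_path`, and `texts_pickle_path` from above, assumes the pickle layout written by `save_index_and_texts` (a dict with "texts" and "sentences"), and embeds the query with the same CLS + L2-normalization pipeline used for the indexed sentences.

import pickle

import faiss
import torch
from sklearn.preprocessing import normalize
from transformers import AutoModel, AutoTokenizer

# Same local model as in the indexing script above.
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = AutoModel.from_pretrained(model_path, local_files_only=True)

index = faiss.read_index(faiss_index_path)
with open(texts_pickle_path, "rb") as f:
    saved = pickle.load(f)
all_sentences = saved["sentences"]  # FAISS row i corresponds to all_sentences[i]

def embed_query(query):
    # Mirror the indexing pipeline: CLS vector, then L2 normalization.
    inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    vec = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
    return normalize(vec.reshape(1, -1)).astype("float32")

try:
    # Only applies to the IVFFlat case: scan more clusters at query time.
    faiss.extract_index_ivf(index).nprobe = 4
except RuntimeError:
    pass  # IndexFlatL2 has no IVF layer; nothing to tune

query_vec = embed_query("your question here")
distances, ids = index.search(query_vec, 3)  # top-3 nearest sentences
for dist, i in zip(distances[0], ids[0]):
    if i == -1:  # FAISS pads missing results with -1
        continue
    hit = all_sentences[i]
    print(f"{dist:.4f}  [{hit['file']}]  {hit['sentence']}")

Because the stored vectors are L2-normalized, ranking by L2 distance is equivalent to ranking by cosine similarity (for unit vectors, squared L2 distance equals 2 minus twice the dot product), so the flat and IVF indexes above behave like cosine retrieval.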
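One caveat on the embedding step: all-MiniLM-L6-v2 is a sentence-transformers checkpoint whose documented pooling is mean pooling over token embeddings (weighted by the attention mask) rather than the raw CLS vector, and it was trained mainly on English text, so a multilingual checkpoint such as paraphrase-multilingual-MiniLM-L12-v2 may retrieve Chinese sentences better. If you want to try mean pooling, a sketch of a drop-in replacement for the CLS lines, reusing `sentence`, `tokenizer`, `model`, and `normalize` from the script, might look like this:

def mean_pool(last_hidden_state, attention_mask):
    # Average the token embeddings, ignoring padding positions.
    mask = attention_mask.unsqueeze(-1).float()
    summed = (last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts

inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)
with torch.no_grad():
    outputs = model(**inputs)
embedding = mean_pool(outputs.last_hidden_state, inputs["attention_mask"]).squeeze().numpy()
embedding = normalize(embedding.reshape(1, -1)).flatten()  # replaces the cls_embedding lines above

Whichever pooling you choose, use the same one for indexing and for queries, or the distances will not be comparable.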