vectorizer.py

from transformers import AutoModel, AutoTokenizer
import os
import torch
import faiss
import numpy as np
import pickle
import re
from sklearn.preprocessing import normalize

# Local model path and resource paths
model_path = r"D:/STUDY/Project/jizhouyao/RAG/models/all-MiniLM-L6-v2"         # Locally stored pre-trained model
resources_path = r"D:/STUDY/Project/jizhouyao/RAG/resources"                   # Folder containing the source .txt files
faiss_index_path = r"D:/STUDY/Project/jizhouyao/RAG/index/vector_index.faiss"  # Where the FAISS index is saved
texts_pickle_path = r"D:/STUDY/Project/jizhouyao/RAG/index/texts.pkl"          # Where the text data is saved

# Check that the required paths exist
if not os.path.exists(model_path):
    raise FileNotFoundError(f"Model path does not exist: {model_path}")
if not os.path.exists(resources_path):
    raise FileNotFoundError(f"Resources folder does not exist: {resources_path}")

# Load the model and tokenizer from the local files
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = AutoModel.from_pretrained(model_path, local_files_only=True)

# Text storage
all_texts = []  # All raw document texts

# Sentence splitter: break Chinese text into sentences with a regular expression
def split_sentences(text):
    sentences = re.split(r'[。!?;::\n]', text)  # Split on sentence-ending punctuation and newlines
    return [sentence.strip() for sentence in sentences if sentence.strip()]
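
# Illustrative example of the splitter above (the input sentence is made up, not from the resource files):
#   split_sentences("今天天气很好。我们去公园走走吧!")
#   -> ['今天天气很好', '我们去公园走走吧']
# The character class covers sentence-ending punctuation and newlines only, so commas and
# other clause-level punctuation stay inside a single sentence.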
# Vectorize the texts and choose the FAISS index type dynamically
def vectorize_and_store(texts):
    """
    Split the texts into sentences, vectorize them, and add them to a FAISS index (sentence level).
    """
    global all_sentences     # All sentences plus their source metadata (module level)
    print("Vectorizing sentences...")
    embeddings = []          # Sentence vectors
    all_sentences = []       # Reset storage

    # Iterate over all text files
    for text_data in texts:
        file_name = text_data["name"]   # File name
        content = text_data["content"]  # File content

        # Split the document into sentences
        sentences = split_sentences(content)
        for sentence in sentences:
            if not sentence.strip():    # Skip empty sentences
                continue
            # Vectorize each sentence
            inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()  # [CLS] token embedding
            # L2-normalize the sentence vector
            cls_embedding = normalize(cls_embedding.reshape(1, -1)).flatten()
            embeddings.append(cls_embedding)
            # Store the sentence together with its source metadata
            all_sentences.append({
                "file": file_name,
                "sentence": sentence
            })

    # Convert the vectors to a float32 numpy array (FAISS expects float32)
    embeddings = np.array(embeddings, dtype="float32")

    # Choose the FAISS index type based on the amount of data
    if len(embeddings) < 10:
        print("Small dataset, using IndexFlatL2...")
        index = faiss.IndexFlatL2(embeddings.shape[1])  # Exact index, no training required
    else:
        num_clusters = max(1, min(10, len(embeddings) // 10))  # Scale the number of clusters with the data size
        print(f"Larger dataset, using an IVFFlat index with {num_clusters} clusters")
        quantizer = faiss.IndexFlatL2(embeddings.shape[1])
        index = faiss.IndexIVFFlat(quantizer, embeddings.shape[1], num_clusters, faiss.METRIC_L2)
        print("Training the IVFFlat index...")
        index.train(embeddings)  # Train the coarse quantizer on the embeddings

    # Add the vectors to the index
    index.add(embeddings)
    print(f"Vectorized and indexed {len(embeddings)} sentences!")
    return index
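
# Note on the metric above: because every sentence vector is L2-normalized, ranking by the
# L2 distance used here is equivalent to ranking by cosine similarity, since for unit
# vectors ||a - b||^2 = 2 - 2 * cos(a, b).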
# Save the index and the sentence data
def save_index_and_texts(index):
    os.makedirs(os.path.dirname(faiss_index_path), exist_ok=True)
    faiss.write_index(index, faiss_index_path)
    with open(texts_pickle_path, 'wb') as f:
        # Save the sentence-level metadata so FAISS ids can be mapped back to sentences at query time
        pickle.dump(all_sentences, f)
    print("FAISS index and sentence data saved!")
# Read the resource files
def process_txt_files(resources_path):
    for filename in os.listdir(resources_path):
        if filename.endswith('.txt'):
            file_path = os.path.join(resources_path, filename)
            print(f"Processing file: {file_path}")
            with open(file_path, 'r', encoding='utf-8') as file:
                all_texts.append({"name": filename, "content": file.read()})
# Main entry point
if __name__ == '__main__':
    process_txt_files(resources_path)       # Read the text data
    index = vectorize_and_store(all_texts)  # Vectorize and build the index
    save_index_and_texts(index)             # Save the index and sentence data
    print("Index construction complete!")