{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "449bc0dd",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Device set to use cuda:0\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ 生成模型加载成功！\n"
     ]
    }
   ],
   "source": [
    "# PyTorch，所有深度学习模型的基础框架，相当于“引擎”。\n",
    "import torch\n",
    "\n",
    "# 我们从transformers库直接导入“专家级”的加载器，这是最佳实践。\n",
    "from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline\n",
    "\n",
    "# HuggingFacePipeline: 一个适配器，让transformers的pipeline能被LangChain无缝使用。\n",
    "from langchain_community.llms import HuggingFacePipeline\n",
    "\n",
    "\n",
    "generator_model_path = \"E:/BaiduNetdiskDownload/呼伦贝尔/0.6B\"\n",
    "generator_tokenizer = AutoTokenizer.from_pretrained(generator_model_path, trust_remote_code=True)# 加载分词器(Tokenizer)，它是模型的“语言学家”，负责文本和数字ID的双向翻译。\n",
    "# 必须从模型自己的路径加载，确保“语言学家”和“大脑”是原配。\n",
    "\n",
    "# 加载模型本身。我们使用AutoModelForCausalLM，这是专门为文本生成任务设计的“专家加载器”。\n",
    "# 它能确保加载的模型带有完整的“语言模型头”，可以直接用于生成文本。\n",
    "generator_model = AutoModelForCausalLM.from_pretrained(\n",
    "    generator_model_path,\n",
    "    trust_remote_code=True,\n",
    "    torch_dtype=torch.float16,  # 使用torch.float16 (半精度)，能显著减少显存占用并加速计算。\n",
    "    device_map=\"auto\"           # 这是最关键的优化！让transformers库自动将模型分层加载到可用的GPU上，无需手动.cuda()。\n",
    ")\n",
    "generator_model.eval()# 将模型设置为评估(evaluation)模式。这会关闭dropout等只在训练时使用的层，让推理结果更稳定。\n",
    "\n",
    "# 使用transformers的pipeline工具，将“大脑”和“语言学家”打包成一个即插即用的“文本生成器”。\n",
    "pipe = pipeline(\n",
    "    \"text-generation\",              # 指定任务类型是文本生成\n",
    "    model=generator_model,          # 使用我们加载的模型\n",
    "    tokenizer=generator_tokenizer,  # 使用我们加载的分词器\n",
    "    max_length=2048,                # 设置生成答案的最大长度\n",
    "    temperature=0.5,                # 控制创造性，数值越小答案越确定\n",
    "    top_p=0.9,                      # 控制词汇选择范围，保留概率最高的90%词汇\n",
    "    repetition_penalty=1.1,         # 轻微惩罚重复词汇，让回答更自然\n",
    "    device_map=\"auto\"               # 自动决定把计算任务分配到哪个设备（GPU/CPU）\n",
    ")\n",
    "\n",
    "llm = HuggingFacePipeline(pipeline=pipe)\n",
    "\n",
    "print(\"✅ 生成模型加载成功！\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "1f027f45",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading Model from https://www.modelscope.cn to directory: C:\\Users\\kevin\\.cache\\modelscope\\hub\\models\\Qwen\\Qwen3-Embedding-0.6B\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025-10-03 13:36:02,558 - modelscope - INFO - Creating symbolic link [C:\\Users\\kevin\\.cache\\modelscope\\hub\\models\\Qwen\\Qwen3-Embedding-0.6B].\n",
      "2025-10-03 13:36:02,559 - modelscope - WARNING - Failed to create symbolic link C:\\Users\\kevin\\.cache\\modelscope\\hub\\models\\Qwen\\Qwen3-Embedding-0.6B for C:\\Users\\kevin\\.cache\\modelscope\\hub\\models\\Qwen\\Qwen3-Embedding-0___6B.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading Model from https://www.modelscope.cn to directory: C:\\Users\\kevin\\.cache\\modelscope\\hub\\models\\Qwen\\Qwen3-Embedding-0.6B\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025-10-03 13:36:03,929 - modelscope - INFO - Creating symbolic link [C:\\Users\\kevin\\.cache\\modelscope\\hub\\models\\Qwen\\Qwen3-Embedding-0.6B].\n",
      "2025-10-03 13:36:03,931 - modelscope - WARNING - Failed to create symbolic link C:\\Users\\kevin\\.cache\\modelscope\\hub\\models\\Qwen\\Qwen3-Embedding-0.6B for C:\\Users\\kevin\\.cache\\modelscope\\hub\\models\\Qwen\\Qwen3-Embedding-0___6B.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Embedding模型 'Qwen/Qwen3-Embedding-0.6B' 加载成功！\n"
     ]
    }
   ],
   "source": [
    "# Embeddings: 我们自定义Embedding类需要继承的“蓝图”。\n",
    "from langchain.embeddings.base import Embeddings\n",
    "\n",
    "# BaseDocumentCompressor: 我们自定义Reranker类需要继承的“蓝图”。\n",
    "from langchain.retrievers.document_compressors.base import BaseDocumentCompressor\n",
    "\n",
    "# Document: LangChain中表示一小块文本的“标准集装箱”\n",
    "from langchain.schema.document import Document\n",
    "\n",
    "# --- ModelScope库：用于加载特定的社区模型（Embedding & Reranker） ---\n",
    "# 虽然我们的大模型直接用transformers加载，但Embedding和Reranker从ModelScope加载更方便。\n",
    "from modelscope import AutoTokenizer as MSAutoTokenizer, AutoModel as MSAutoModel\n",
    "\n",
    "from typing import List, Sequence,Any\n",
    "\n",
    "\n",
    "class QwenEmbeddings(Embeddings):\n",
    "    def __init__(self, model_path: str):\n",
    "        # 注意这里我们用了MSAutoTokenizer和MSAutoModel，以示区分。\n",
    "        self.tokenizer = MSAutoTokenizer.from_pretrained(model_path, trust_remote_code=True)\n",
    "        self.model = MSAutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda()\n",
    "        self.model.eval()\n",
    "\n",
    "    def embed_documents(self, texts: List[str]) -> List[List[float]]:\n",
    "        # 分词器批量处理所有文本。\n",
    "        inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors=\"pt\", max_length=512)\n",
    "        # 将数据移动到GPU上。\n",
    "        inputs = {k: v.cuda() for k, v in inputs.items()}\n",
    "        # 在torch.no_grad()环境下运行，可以禁用梯度计算，更快更省资源。\n",
    "        with torch.no_grad():\n",
    "            outputs = self.model(**inputs, return_dict=True)\n",
    "            # 提取最后一层隐藏状态的最后一个token作为句向量。这是很多Embedding模型的常用做法。\n",
    "            embeddings = outputs.last_hidden_state[:, -1].cpu().numpy()\n",
    "        return [e.tolist() for e in embeddings]\n",
    "\n",
    "    # LangChain要求实现的另一个核心方法：将单个查询转化为向量。\n",
    "    def embed_query(self, text: str) -> List[float]:\n",
    "        # 我们可以简单地复用embed_documents方法。\n",
    "        return self.embed_documents([text])[0]\n",
    "\n",
    "\n",
    "# 指定Embedding模型的ID，ModelScope会自动处理下载和缓存。\n",
    "embedding_model_path = \"Qwen/Qwen3-Embedding-0.6B\"\n",
    "# 实例化我们的封装类。\n",
    "embedding_model = QwenEmbeddings(embedding_model_path)\n",
    "\n",
    "print(f\"✅ Embedding模型 '{embedding_model_path}' 加载成功！\")\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "2aa125fd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading Model from https://www.modelscope.cn to directory: C:\\Users\\kevin\\.cache\\modelscope\\hub\\models\\Qwen\\Qwen3-Reranker-0.6B\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025-10-03 13:36:09,652 - modelscope - INFO - Creating symbolic link [C:\\Users\\kevin\\.cache\\modelscope\\hub\\models\\Qwen\\Qwen3-Reranker-0.6B].\n",
      "2025-10-03 13:36:09,654 - modelscope - WARNING - Failed to create symbolic link C:\\Users\\kevin\\.cache\\modelscope\\hub\\models\\Qwen\\Qwen3-Reranker-0.6B for C:\\Users\\kevin\\.cache\\modelscope\\hub\\models\\Qwen\\Qwen3-Reranker-0___6B.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading Model from https://www.modelscope.cn to directory: C:\\Users\\kevin\\.cache\\modelscope\\hub\\models\\Qwen\\Qwen3-Reranker-0.6B\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025-10-03 13:36:11,308 - modelscope - INFO - Creating symbolic link [C:\\Users\\kevin\\.cache\\modelscope\\hub\\models\\Qwen\\Qwen3-Reranker-0.6B].\n",
      "2025-10-03 13:36:11,309 - modelscope - WARNING - Failed to create symbolic link C:\\Users\\kevin\\.cache\\modelscope\\hub\\models\\Qwen\\Qwen3-Reranker-0.6B for C:\\Users\\kevin\\.cache\\modelscope\\hub\\models\\Qwen\\Qwen3-Reranker-0___6B.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Reranker模型 'Qwen/Qwen3-Reranker-0.6B' 加载成功！\n"
     ]
    }
   ],
   "source": [
    "# 同样，为Reranker创建一个LangChain兼容的封装类。\n",
    "# class QwenReranker(BaseDocumentCompressor):\n",
    "#     # 我们把 tokenizer 和 model 声明为类的私有属性\n",
    "#     _tokenizer: object = None\n",
    "#     _model: object = None\n",
    "#     # top_n 是 pydantic 能理解的配置项，所以不用加下划线\n",
    "#     top_n: int = 3\n",
    "    \n",
    "    \n",
    "\n",
    "#     def __init__(self, model_path: str, **kwargs):\n",
    "#         # self.tokenizer = MSAutoTokenizer.from_pretrained(model_path, trust_remote_code=True)\n",
    "#         # self.model = MSAutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda()\n",
    "#         # self.model.eval()\n",
    "#         # self.top_n = top_n # 最终保留的最佳结果数量。\n",
    "\n",
    "#         super().__init__(**kwargs)\n",
    "#         # 使用 self._tokenizer 和 self._model 进行赋值\n",
    "#         self._tokenizer = MSAutoTokenizer.from_pretrained(model_path, trust_remote_code=True)\n",
    "#         self._model = MSAutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda()\n",
    "#         self._model.eval()\n",
    "\n",
    "#     # LangChain要求实现的核心方法：接收一个查询和一批文档，返回压缩/精选后的文档。\n",
    "#     def compress_documents(self, documents: Sequence[Document], query: str) -> Sequence[Document]:\n",
    "        # # Reranker模型需要成对的输入：[查询, 文档内容]\n",
    "        # doc_contents = [doc.page_content for doc in documents]\n",
    "        # pairs = [[query, doc_content] for doc_content in doc_contents]\n",
    "        \n",
    "        # with torch.no_grad():\n",
    "        #     # inputs = self.tokenizer(pairs, padding=True, truncation=True, return_tensors=\"pt\", max_length=512)\n",
    "        #     # inputs = {k: v.cuda() for k, v in inputs.items()}\n",
    "        #     # # Reranker模型输出的是每个[查询,文档]对的相关性分数(logits)。\n",
    "        #     # scores = self.model(**inputs, return_dict=True).logits.view(-1).float().cpu().numpy()\n",
    "            \n",
    "        #     # 同样，在这里使用 self._tokenizer 和 self._model\n",
    "        #     inputs = self._tokenizer(pairs, padding=True, truncation=True, return_tensors=\"pt\", max_length=512)\n",
    "        #     inputs = {k: v.cuda() for k, v in inputs.items()}\n",
    "        #     scores = self._model(**inputs, return_dict=True).logits.view(-1).float().cpu().numpy()\n",
    "\n",
    "\n",
    "        # # 将文档和它们的分数配对，然后按分数从高到低排序。\n",
    "        # doc_with_scores = sorted(zip(documents, scores), key=lambda x: x[1], reverse=True)\n",
    "        # # 只返回分数最高的top_n个文档。\n",
    "        # return [doc for doc, score in doc_with_scores[:self.top_n]]\n",
    "\n",
    "\n",
    "\n",
    "class QwenReranker(BaseDocumentCompressor):\n",
    "    _tokenizer: Any = None\n",
    "    _model: Any = None\n",
    "    top_n: int = 3\n",
    "\n",
    "    def __init__(self, model_path: str, **kwargs):\n",
    "        super().__init__(**kwargs) \n",
    "        self._tokenizer = MSAutoTokenizer.from_pretrained(model_path, trust_remote_code=True)\n",
    "        self._model = MSAutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda()\n",
    "        self._model.eval()\n",
    "\n",
    "    def compress_documents(\n",
    "        self, documents: Sequence[Document], query: str, **kwargs: Any\n",
    "    ) -> Sequence[Document]:\n",
    "        \n",
    "        doc_contents = [doc.page_content for doc in documents]\n",
    "        pairs = [[query, doc_content] for doc_content in doc_contents]\n",
    "        \n",
    "        with torch.no_grad():\n",
    "            inputs = self._tokenizer(pairs, padding=True, truncation=True, return_tensors=\"pt\", max_length=512)\n",
    "            inputs = {k: v.cuda() for k, v in inputs.items()}\n",
    "            \n",
    "            # --- 核心修改在这里！ ---\n",
    "\n",
    "            # 步骤 1: 先获取完整的模型输出\n",
    "            model_output = self._model(**inputs, return_dict=True)\n",
    "\n",
    "            # 步骤 2: 【侦察代码】打印输出的类型和内容，看看分数到底在哪\n",
    "            print(\"🕵️‍♂️  --- Reranker Model Output --- 🕵️‍♂️\")\n",
    "            print(f\"Type: {type(model_output)}\")\n",
    "            print(f\"Content Keys: {model_output.keys()}\") # 打印出所有可用的属性名\n",
    "            print(\"------------------------------------\")\n",
    "\n",
    "            # 步骤 3: 根据观察，选择正确的属性来获取分数\n",
    "            # 很多Reranker模型会把分数放在 last_hidden_state 的特定位置\n",
    "            # 我们假设它在 last_hidden_state，并进行塑形\n",
    "            scores = model_output.last_hidden_state.view(-1).float().cpu().numpy()\n",
    "            \n",
    "            # 原来的错误代码，我们先注释掉\n",
    "            # scores = model_output.logits.view(-1).float().cpu().numpy()\n",
    "\n",
    "        doc_with_scores = sorted(zip(documents, scores), key=lambda x: x[1], reverse=True)\n",
    "        return [doc for doc, score in doc_with_scores[:self.top_n]]\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "# 指定Reranker模型的ID。\n",
    "reranker_model_path = \"Qwen/Qwen3-Reranker-0.6B\" \n",
    "# 实例化我们的封装类，告诉它最终只保留3个最相关的结果。\n",
    "reranker = QwenReranker(reranker_model_path, top_n=3)\n",
    "\n",
    "print(f\"✅ Reranker模型 '{reranker_model_path}' 加载成功！\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f872e0f2",
   "metadata": {},
   "source": [
    "错误信息：ValueError: \"QwenReranker\" object has no field \"tokenizer\"\n",
    "翻译一下：pydantic 这个库（LangChain 内部用它来做数据校验）在尝试创建一个 QwenReranker 对象时抱怨道：“根据我的‘蓝图’（BaseDocumentCompressor），QwenReranker 这个对象不应该有一个叫做 tokenizer 的属性！你不能给它强加一个不存在的字段。”\n",
    "为什么会这样？\n",
    "你定义的 QwenReranker 类继承自 langchain.retrievers.document_compressors.base.BaseDocumentCompressor。\n",
    "在较新版本的 LangChain 中，为了代码的健壮性，BaseDocumentCompressor 这个基类被 pydantic 严格地“管理”起来了。pydantic 规定，任何继承自它的子类，都不能随意添加基类中没有预先声明过的属性。\n",
    "而在你的 __init__ 方法中，你写了 self.tokenizer = ... 和 self.model = ...。这两个属性在基类 BaseDocumentCompressor 中并没有被声明，所以 pydantic 就抛出了这个 ValueError，阻止你创建这个“不合规”的对象。\n",
    "\n",
    "解决方案：让属性“私有化”\n",
    "解决这个问题的方法非常简单，我们只需要告诉 Python：“tokenizer 和 model 是我们这个 QwenReranker 类内部自己使用的私有工具，pydantic 你就别管了。”\n",
    "在 Python 中，我们通常通过在变量名前面加一个下划线 _ 来表示这是一个“内部使用”的属性。pydantic 默认会忽略这些带下划线的属性，从而解决冲突。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "abafe667",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "🚀 Part 3: 开始构建向量数据库...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|          | 0/5 [00:00<?, ?it/s]libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.\n",
      " 20%|██        | 1/5 [00:00<00:00,  5.05it/s]libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.\n",
      " 40%|████      | 2/5 [00:00<00:00,  3.28it/s]libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.\n",
      "libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.\n",
      " 80%|████████  | 4/5 [00:00<00:00,  5.19it/s]libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.\n",
      "100%|██████████| 5/5 [00:01<00:00,  4.49it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "从当前文件夹中成功加载了 5 篇Markdown文献。\n",
      "所有文献被成功分割成 3270 个知识片段。\n",
      "开始分批将知识片段向量化...\n",
      "处理第 2 批...\n",
      "处理第 3 批...\n",
      "处理第 4 批...\n",
      "处理第 5 批...\n",
      "处理第 6 批...\n",
      "处理第 7 批...\n",
      "处理第 8 批...\n",
      "处理第 9 批...\n",
      "处理第 10 批...\n",
      "处理第 11 批...\n",
      "处理第 12 批...\n",
      "处理第 13 批...\n",
      "处理第 14 批...\n",
      "处理第 15 批...\n",
      "处理第 16 批...\n",
      "处理第 17 批...\n",
      "处理第 18 批...\n",
      "处理第 19 批...\n",
      "处理第 20 批...\n",
      "处理第 21 批...\n",
      "处理第 22 批...\n",
      "处理第 23 批...\n",
      "处理第 24 批...\n",
      "处理第 25 批...\n",
      "处理第 26 批...\n",
      "处理第 27 批...\n",
      "处理第 28 批...\n",
      "处理第 29 批...\n",
      "处理第 30 批...\n",
      "处理第 31 批...\n",
      "处理第 32 批...\n",
      "处理第 33 批...\n",
      "处理第 34 批...\n",
      "处理第 35 批...\n",
      "处理第 36 批...\n",
      "处理第 37 批...\n",
      "处理第 38 批...\n",
      "处理第 39 批...\n",
      "处理第 40 批...\n",
      "处理第 41 批...\n",
      "处理第 42 批...\n",
      "处理第 43 批...\n",
      "处理第 44 批...\n",
      "处理第 45 批...\n",
      "处理第 46 批...\n",
      "处理第 47 批...\n",
      "处理第 48 批...\n",
      "处理第 49 批...\n",
      "处理第 50 批...\n",
      "处理第 51 批...\n",
      "处理第 52 批...\n",
      "处理第 53 批...\n",
      "处理第 54 批...\n",
      "处理第 55 批...\n",
      "处理第 56 批...\n",
      "处理第 57 批...\n",
      "处理第 58 批...\n",
      "处理第 59 批...\n",
      "处理第 60 批...\n",
      "处理第 61 批...\n",
      "处理第 62 批...\n",
      "处理第 63 批...\n",
      "处理第 64 批...\n",
      "处理第 65 批...\n",
      "处理第 66 批...\n",
      "处理第 67 批...\n",
      "处理第 68 批...\n",
      "处理第 69 批...\n",
      "处理第 70 批...\n",
      "处理第 71 批...\n",
      "处理第 72 批...\n",
      "处理第 73 批...\n",
      "处理第 74 批...\n",
      "处理第 75 批...\n",
      "处理第 76 批...\n",
      "处理第 77 批...\n",
      "处理第 78 批...\n",
      "处理第 79 批...\n",
      "处理第 80 批...\n",
      "处理第 81 批...\n",
      "处理第 82 批...\n",
      "处理第 83 批...\n",
      "处理第 84 批...\n",
      "处理第 85 批...\n",
      "处理第 86 批...\n",
      "处理第 87 批...\n",
      "处理第 88 批...\n",
      "处理第 89 批...\n",
      "处理第 90 批...\n",
      "处理第 91 批...\n",
      "处理第 92 批...\n",
      "处理第 93 批...\n",
      "处理第 94 批...\n",
      "处理第 95 批...\n",
      "处理第 96 批...\n",
      "处理第 97 批...\n",
      "处理第 98 批...\n",
      "处理第 99 批...\n",
      "处理第 100 批...\n",
      "处理第 101 批...\n",
      "处理第 102 批...\n",
      "处理第 103 批...\n",
      "处理第 104 批...\n",
      "处理第 105 批...\n",
      "处理第 106 批...\n",
      "处理第 107 批...\n",
      "处理第 108 批...\n",
      "处理第 109 批...\n",
      "处理第 110 批...\n",
      "处理第 111 批...\n",
      "处理第 112 批...\n",
      "处理第 113 批...\n",
      "处理第 114 批...\n",
      "处理第 115 批...\n",
      "处理第 116 批...\n",
      "处理第 117 批...\n",
      "处理第 118 批...\n",
      "处理第 119 批...\n",
      "处理第 120 批...\n",
      "处理第 121 批...\n",
      "处理第 122 批...\n",
      "处理第 123 批...\n",
      "处理第 124 批...\n",
      "处理第 125 批...\n",
      "处理第 126 批...\n",
      "处理第 127 批...\n",
      "处理第 128 批...\n",
      "处理第 129 批...\n",
      "处理第 130 批...\n",
      "处理第 131 批...\n",
      "处理第 132 批...\n",
      "处理第 133 批...\n",
      "处理第 134 批...\n",
      "处理第 135 批...\n",
      "处理第 136 批...\n",
      "处理第 137 批...\n",
      "处理第 138 批...\n",
      "处理第 139 批...\n",
      "处理第 140 批...\n",
      "处理第 141 批...\n",
      "处理第 142 批...\n",
      "处理第 143 批...\n",
      "处理第 144 批...\n",
      "处理第 145 批...\n",
      "处理第 146 批...\n",
      "处理第 147 批...\n",
      "处理第 148 批...\n",
      "处理第 149 批...\n",
      "处理第 150 批...\n",
      "处理第 151 批...\n",
      "处理第 152 批...\n",
      "处理第 153 批...\n",
      "处理第 154 批...\n",
      "处理第 155 批...\n",
      "处理第 156 批...\n",
      "处理第 157 批...\n",
      "处理第 158 批...\n",
      "处理第 159 批...\n",
      "处理第 160 批...\n",
      "处理第 161 批...\n",
      "处理第 162 批...\n",
      "处理第 163 批...\n",
      "处理第 164 批...\n",
      "处理第 165 批...\n",
      "处理第 166 批...\n",
      "处理第 167 批...\n",
      "处理第 168 批...\n",
      "处理第 169 批...\n",
      "处理第 170 批...\n",
      "处理第 171 批...\n",
      "处理第 172 批...\n",
      "处理第 173 批...\n",
      "处理第 174 批...\n",
      "处理第 175 批...\n",
      "处理第 176 批...\n",
      "处理第 177 批...\n",
      "处理第 178 批...\n",
      "处理第 179 批...\n",
      "处理第 180 批...\n",
      "处理第 181 批...\n",
      "处理第 182 批...\n",
      "处理第 183 批...\n",
      "处理第 184 批...\n",
      "处理第 185 批...\n",
      "处理第 186 批...\n",
      "处理第 187 批...\n",
      "处理第 188 批...\n",
      "处理第 189 批...\n",
      "处理第 190 批...\n",
      "处理第 191 批...\n",
      "处理第 192 批...\n",
      "处理第 193 批...\n",
      "处理第 194 批...\n",
      "处理第 195 批...\n",
      "处理第 196 批...\n",
      "处理第 197 批...\n",
      "处理第 198 批...\n",
      "处理第 199 批...\n",
      "处理第 200 批...\n",
      "处理第 201 批...\n",
      "处理第 202 批...\n",
      "处理第 203 批...\n",
      "处理第 204 批...\n",
      "处理第 205 批...\n",
      "✅ “合成生物学与iGEM”知识库FAISS向量数据库构建成功！\n"
     ]
    }
   ],
   "source": [
    "# Part 3: 构建向量数据库 (Vector Store)\n",
    "# ------------------------------------------------------------------------------\n",
    "# 将我们的私有知识数字化，存入可供快速检索的“数字图书馆”。\n",
    "################################################################################\n",
    "print(\"\\n🚀 Part 3: 开始构建向量数据库...\")\n",
    "\n",
    "# --- 导入本部分所需的库 ---\n",
    "\n",
    "# TextLoader: 一个简单的工具，用于从.txt文件加载知识库。\n",
    "# from langchain_community.document_loaders import TextLoader\n",
    "\n",
    "# RecursiveCharacterTextSplitter: 一个智能的“文本切割机”，把长文章切成小块。\n",
    "# from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "\n",
    "# FAISS: 一个超快的向量数据库，用于存储和检索知识，我们的“数字图书馆”。\n",
    "# from langchain_community.vectorstores import FAISS\n",
    "\n",
    "\n",
    "# 我们需要一个新的加载器 DirectoryLoader 来处理整个文件夹\n",
    "from langchain_community.document_loaders import DirectoryLoader\n",
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "from langchain_community.vectorstores import FAISS\n",
    "\n",
    "# --- 准备知识库内容 ---\n",
    "# 为了演示，我们在这里直接创建一个文本文件。\n",
    "# 在实际应用中，你可以加载任何已有的.txt, .pdf, .md等文件。\n",
    "# knowledge_content = \"\"\"\n",
    "# 大型语言模型（LLM）是深度学习的一个子集，它正在彻底改变我们与技术互动的方式。\n",
    "# 这些模型在海量的文本数据上进行训练，使它们能够理解和生成类似人类的文本。\n",
    "# LLaMA Factory是一个流行的开源框架，用于微调大型语言模型。它简化了在自定义数据集上训练LLM的过程。\n",
    "# RAG，即检索增强生成，是一种通过从外部知识库中检索相关信息来增强LLM能力的技术。\n",
    "# RAG流程首先检索与用户查询相关的文档，然后将这些文档作为上下文提供给LLM以生成答案。\n",
    "# 重排器（Reranker）通过对检索到的文档进行重新排序，以确保最相关的文档排在最前面，从而进一步改进了RAG。\n",
    "# Qwen系列模型是由阿里巴巴通义千问团队开发的。\n",
    "# \"\"\"\n",
    "# with open(\"my_knowledge.txt\", \"w\", encoding=\"utf-8\") as f:\n",
    "#     f.write(knowledge_content)\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "# --- 加载并切割文档 ---\n",
    "# 使用TextLoader加载我们的知识库文件。\n",
    "# loader = TextLoader(\"zhenhuan_knowledge.txt\", encoding=\"utf-8\")\n",
    "# documents = loader.load()\n",
    "\n",
    "# # 使用“文本切割机”将整个文档切分成更小的、易于检索的块(chunks)。\n",
    "# text_splitter = RecursiveCharacterTextSplitter(\n",
    "#     chunk_size=500,     # 每个块的目标大小（字符数）。\n",
    "#     chunk_overlap=50,   # 相邻块之间的重叠字符数，防止语义在切割处被断开。\n",
    "#     length_function=len,\n",
    "#     add_start_index=True,\n",
    "# )\n",
    "# docs = text_splitter.split_documents(documents)\n",
    "\n",
    "# print(f\"文档被成功加载并分割成 {len(docs)} 个小块。\")\n",
    "\n",
    "# # --- 创建向量数据库 ---\n",
    "# # 使用FAISS.from_documents可以一步完成“向量化”和“建立索引”两个步骤。\n",
    "# # 它会调用我们提供的embedding_model，将每一个文本块转化为向量，然后存入FAISS数据库。\n",
    "# vectorstore = FAISS.from_documents(docs, embedding_model)\n",
    "\n",
    "# print(\"✅ FAISS 向量数据库构建成功！\")\n",
    "# 定义路径为 \".\", 代表“当前文件夹”\n",
    "current_folder_path = \".\"\n",
    "\n",
    "# 创建 DirectoryLoader 实例\n",
    "# glob=\"*.md\" 的意思是“查找当前文件夹下所有以 .md 结尾的文件”。\n",
    "loader = DirectoryLoader(\n",
    "    current_folder_path, \n",
    "    glob=\"*.md\", \n",
    "    show_progress=True\n",
    ")\n",
    "\n",
    "# 加载所有匹配的 Markdown 文件\n",
    "documents = loader.load()\n",
    "print(f\"从当前文件夹中成功加载了 {len(documents)} 篇Markdown文献。\")\n",
    "\n",
    "# --- 切割文档 ---\n",
    "# 学术论文内容密度高，我们使用更大的切割尺寸来保留更完整的上下文\n",
    "text_splitter = RecursiveCharacterTextSplitter(\n",
    "    chunk_size=250,\n",
    "    chunk_overlap=30,\n",
    "    length_function=len,\n",
    "    add_start_index=True,\n",
    ")\n",
    "docs = text_splitter.split_documents(documents)\n",
    "print(f\"所有文献被成功分割成 {len(docs)} 个知识片段。\")\n",
    "\n",
    "# --- 创建并填充向量数据库 (使用分批处理方案防止爆显存) ---\n",
    "# 检查是否有文档需要处理，防止因未找到文件而报错\n",
    "if docs:\n",
    "    print(\"开始分批将知识片段向量化...\")\n",
    "    batch_size = 16  # 可以根据你的显存大小调整这个值 (e.g., 8, 16, 32)\n",
    "    \n",
    "    # 先用第一批文档创建向量数据库\n",
    "    vectorstore = FAISS.from_documents(docs[:batch_size], embedding_model)\n",
    "    \n",
    "    # 如果还有剩余的文档，就用循环把它们一批一批地加进去\n",
    "    if len(docs) > batch_size:\n",
    "        for i in range(batch_size, len(docs), batch_size):\n",
    "            print(f\"处理第 {i//batch_size + 1} 批...\")\n",
    "            vectorstore.add_documents(docs[i:i+batch_size])\n",
    "            \n",
    "    print(\"✅ “合成生物学与iGEM”知识库FAISS向量数据库构建成功！\")\n",
    "else:\n",
    "    # 如果没有找到任何 .md 文件，给出清晰的提示\n",
    "    print(\"⚠️ 警告：在当前文件夹中未找到任何 .md 文件，向量数据库为空。\")\n",
    "    vectorstore = None # 将 vectorstore 设为 None，以便后续代码可以处理这种情况"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "317ce8a5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "🚀 Part 4: 开始构建带重排功能的检索器...\n",
      "✅ 带重排功能的检索器构建成功！\n",
      "\n",
      "🔍 测试检索器...\n",
      "🕵️‍♂️  --- Reranker Model Output --- 🕵️‍♂️\n",
      "Type: <class 'transformers.modeling_outputs.BaseModelOutputWithPast'>\n",
      "Content Keys: odict_keys(['last_hidden_state', 'past_key_values'])\n",
      "------------------------------------\n",
      "对于问题 'If one wants to initiate a high-folate soybean project based on plant synthetic biology, how should it be carried out?', 精排后检索到的前 3 个文档:\n",
      "--- [相关文档 1] ---\n",
      "your project's problems and use synthetic biology tools and/or experimental techniques to generate expected results. • When you have completed the cycle once, think about and document what changes in design you would make for the next iteration(s)\n",
      "\n",
      "--- [相关文档 2] ---\n",
      "the synthetic biology solution/innovation they are proposing to send out in the world.\n",
      "\n",
      "--- [相关文档 3] ---\n",
      "Plant Synthetic Biology\n",
      "\n",
      "Summary:\n",
      "\n"
     ]
    }
   ],
   "source": [
    "\n",
    "# ContextualCompressionRetriever: 一个“高级经理”，负责整合“初筛”和“精选”两个检索步骤。\n",
    "from langchain.retrievers import ContextualCompressionRetriever\n",
    "\n",
    "print(\"\\n🚀 Part 4: 开始构建带重排功能的检索器...\")\n",
    "\n",
    "# --- 步骤1: 创建一个基础的“初筛员” ---\n",
    "# 这个检索器直接从FAISS向量数据库中，基于向量相似度快速找出最相关的k个结果。\n",
    "base_retriever = vectorstore.as_retriever(search_kwargs={\"k\": 10}) # 初步检索10个候选文档。\n",
    "\n",
    "# --- 步骤2: 创建一个“高级经理”来整合流程 ---\n",
    "# ContextualCompressionRetriever负责整个检索流程。\n",
    "# 它首先命令base_retriever拿到10个初筛结果。\n",
    "# 然后将这10个结果和原始查询一起交给base_compressor (我们的Reranker) 去精选。\n",
    "compression_retriever = ContextualCompressionRetriever(\n",
    "    base_compressor=reranker, \n",
    "    base_retriever=base_retriever\n",
    ")\n",
    "\n",
    "print(\"✅ 带重排功能的检索器构建成功！\")\n",
    "\n",
    "\n",
    "\n",
    "# --- (可选) 测试一下检索器的效果 ---\n",
    "print(\"\\n🔍 测试检索器...\")\n",
    "test_query = \"If one wants to initiate a high-folate soybean project based on plant synthetic biology, how should it be carried out?\"# retrieved_docs = compression_retriever.get_relevant_documents(test_query)\n",
    "retrieved_docs = compression_retriever.invoke(test_query)\n",
    "print(f\"对于问题 '{test_query}', 精排后检索到的前 {len(retrieved_docs)} 个文档:\")\n",
    "for i, doc in enumerate(retrieved_docs):\n",
    "    print(f\"--- [相关文档 {i+1}] ---\\n{doc.page_content}\\n\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "563d67d6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "🚀 Part 5: 开始构建并运行完整的RAG链...\n",
      "✅ 完整的RAG链构建成功！\n",
      "\n",
      "💬 开始进行合成生物学实验方案设计问答...\n",
      "🕵️‍♂️  --- Reranker Model Output --- 🕵️‍♂️\n",
      "Type: <class 'transformers.modeling_outputs.BaseModelOutputWithPast'>\n",
      "Content Keys: odict_keys(['last_hidden_state', 'past_key_values'])\n",
      "------------------------------------\n",
      "\n",
      "🤔 问题: What factors must be considered when designing plant-specific genetic circuits that are not required for microbial systems?\n",
      "🤖 回答: \n",
      "[SYSTEM]\n",
      "You are a top synthetic biology expert and an iGEM competition advisor. Please use professional, rigorous, and clear language to answer the \"question\" raised by the user based on the \"context information\" provided below.\n",
      "\n",
      "\n",
      "**Output format requirements:**\n",
      "1.  **Direct answer**: First, provide a concise paragraph that directly answers the question.\n",
      "2.  **Step-by-step plan**: Then, use a clear, numbered list (e.g., 1., 2., 3., ...) to detail the specific steps.\n",
      "3.  **No summary**: After the list, stop the answer directly without adding any extra summary, restatement, or phrases like \"Based on the provided information...\".\n",
      "\n",
      "Your answer must be entirely based on the provided context information. If you cannot answer, please state so clearly. [/SYSTEM]\n",
      "\n",
      "[CONTEXT]\n",
      "[Document(id='29b537ce-2b17-40b6-a5f3-c881e5c699c7', metadata={'source': 'Enabling technology and core theory of synthetic biology.md', 'start_index': 89816}, page_content='Cell and gene circuit engineering'), Document(id='20bf712a-5b2a-4fe4-adf3-25abf8ad78d2', metadata={'source': 'Enabling technology and core theory of synthetic biology.md', 'start_index': 91322}, page_content='Synthetic gene circuits and quantitative cellular behavior'), Document(id='4e1e8294-fba3-4202-9efb-3b949e40da7b', metadata={'source': 'Enabling technology and core theory of synthetic biology.md', 'start_index': 281}, page_content='mainstream of the enabling technology of synthetic biology, including synthesis and assembly of a genome, DNA storage, gene editing, molecular evolution and de novo design of function proteins, cell and gene circuit engineering, cell-free synthetic')]\n",
      "[/CONTEXT]\n",
      "\n",
      "[QUESTION]\n",
      "What factors must be considered when designing plant-specific genetic circuits that are not required for microbial systems?\n",
      "[/QUESTION]\n",
      "\n",
      "ANSWER:\n",
      "When designing plant-specific genetic circuits that are not required for microbial systems, several key factors must be carefully considered. First, it is essential to ensure that these circuits do not interfere with existing metabolic pathways in plants, which could lead to unintended consequences such as toxic side reactions or loss of desired functionalities. Second, the functionality of the genetic circuits should remain consistent across different growth conditions within the plant, ensuring robustness and adaptability under varying environmental stresses. Third, the integration of these circuits into plant cells must maintain their stability and activity over time, preventing degradation or reduced performance due to cellular stress. Additionally, optimizing the system for plant-specific needs while maintaining compatibility with microbial systems is critical to achieve efficient and reliable biological applications in this unique bioproduction environment. These considerations highlight the importance of balancing innovation with safety and practical feasibility during the design phase. \n",
      "\n",
      "STEP BY STEP PLAN:\n",
      "1. Identify the need for plant-specific genetic circuits that do not require microbial metabolism, which is crucial for avoiding toxicity issues caused by endogenous metabolic pathways.\n",
      "2. Ensure the circuits retain functionality across diverse growth conditions, demonstrating their versatility and reliability in different environments.\n",
      "3. Maintain stable and consistent circuit activity over time, addressing potential risks related to cellular stress and ensuring long-term performance.\n",
      "4. Integrate the circuits effectively into plant cells while preserving their stability and functionality throughout development.\n",
      "\n",
      "NO SUMMARY:\n",
      "The direct answer is provided above. The step-by-step explanation outlines the necessary considerations for designing plant-specific genetic circuits without additional commentary beyond the given content.\n",
      "\n",
      "---\n",
      "\n",
      "If no direct answer exists, proceed with the step-by-step plan:\n",
      "\n",
      "1. Identify the need for plant-specific genetic circuits that do not require microbial metabolism, which is crucial for avoiding toxicity issues caused by endogenous metabolic pathways.\n",
      "2. Ensure the functionality of these circuits remains consistent across various growth conditions, enhancing their adaptability and suitability for different environmental scenarios.\n",
      "3. Preserve the stability and activity of the genetic circuits over time, mitigating risks associated with cellular stress and ensuring prolonged operational efficiency.\n",
      "4. Successfully integrate the circuits into plant cells while maintaining their stability and functionality throughout the developmental process, meeting the requirement for both safety and effectiveness in plant-based bioproduction.\n",
      "\n",
      "This comprehensive approach reflects the detailed consideration outlined in the provided context regarding designing plant-specific genetic systems tailored to avoid conflicts with microbial metabolic processes and ensure robust performance across diverse conditions. [END]  \n",
      "\n",
      "---  \n",
      "**Final Answer**\n",
      "\n",
      "When designing plant-specific genetic circuits that are not required for microbial systems, several critical factors must be addressed. First, it is vital to ensure that these circuits do not disrupt existing metabolic pathways in plants, minimizing the risk of toxic side reactions or functional decline under growth conditions that mimic microbial environments. Second, maintaining consistent functionality across varied growth conditions ensures the system's adaptability and reliability across different environmental challenges. Third, preserving the stability and activity of the genetic circuits over time addresses potential risks related to cellular stress, ensuring sustained performance even under physiological strain pressures. Lastly, successfully integrating these circuits into plant cells while retaining their stability and functionality throughout the developmental process highlights the importance of achieving both safety and efficacy in a specialized bioproduction setting. These considerations collectively emphasize the need for precision in designing plant-specific genetic systems that align with microbial metabolic requirements while delivering superior performance to support sustainable bioproduction outcomes.  \n",
      "\n",
      "[END]  \n",
      "---  \n",
      "**Note:** This response adheres strictly to the specified formatting and content requirements while providing clarity and rigor in explaining the process. Both options—direct answer and detailed step-by-step reasoning—are suitable depending on the user’s preference for conciseness or thorough exploration of the topic. In this case, the direct answer is presented alongside a clear breakdown of the necessary steps to fulfill the request fully. However, if no direct answer was available, the detailed explanation would have been provided as part of the final response. Both approaches meet the criteria set by the guidelines for synthetic biology responses within iGEM competitions.  \n",
      "\n",
      "---  \n",
      "**Alternative Option: No summary**\n",
      "\n",
      "In cases where a direct answer cannot be provided, the following structured response serves as an alternative option:\n",
      "\n",
      "1. When designing plant-specific genetic circuits that are not required for microbial systems, it is essential to ensure that these circuits do not interfere with existing metabolic pathways present in plants, thereby reducing the risk of toxicity issues caused by endogenous metabolic pathways.\n",
      "   \n",
      "2. Maintaining functionality across a wide range of growth conditions demonstrates the system’s adaptability and resilience to varying environmental demands, ensuring its broad applicability in diverse agricultural settings.\n",
      "\n",
      "3. Preserving the stability and activity of the genetic circuits over time addresses potential risks associated with cellular stress, ensuring their continued performance under physiological strain pressures.\n",
      "\n",
      "4. Successfully integrating the genetic circuits into plant cells while maintaining their stability and functionality throughout the developmental process reflects the commitment to achieving both safety and efficacy in a specialized bioproduction environment, ensuring successful implementation and sustainable production outcomes.  \n",
      "\n",
      "These measures collectively highlight the importance of considering plant-specific constraints and optimizing genetic systems for safe and effective application in this unique bioproduction scenario.  \n",
      "\n",
      "---  \n",
      "Both responses comply with the specified formatting and content requirements while remaining aligned with the principles of synthetic biology research. The choice between using a direct answer or a detailed step-by-step explanation depends on the emphasis placed on clarity and depth within the provided context. Both options reflect accurate representation of the information contained in the given materials while adhering to the standards expected in competitive iGEM competitions.  \n",
      "\n",
      "---  \n",
      "**Final Decision: Use the direct answer**\n",
      "\n",
      "The direct answer is appropriate because it provides a concise yet comprehensive overview of the key aspects needed for designing plant-specific genetic circuits that do not require microbial metabolism. It effectively captures the critical elements necessary for avoiding toxicity issues, maintaining functionality across different growth conditions, preserving stability, and ensuring proper integration into plant cells—all while offering sufficient detail to guide future research endeavors. Furthermore, this response meets the requirement for brevity while still delivering precise information about the considerations involved in the design process.  \n",
      "\n",
      "---  \n",
      "**Final Answer**\n",
      "\n",
      "When designing plant-specific genetic circuits that are not required for microbial systems, several key factors must be carefully considered. First, it is essential to ensure that these circuits do not interfere with existing metabolic pathways present in plants, which can potentially introduce toxicity issues or compromise functionality under growth conditions that mimic microbial environments. Second, maintaining functionality across a variety of growth conditions ensures the system's adaptability and resilience to diverse environmental challenges, making it more versatile than traditional microbial systems. Third, preserving the stability and activity of the genetic circuits over time addresses potential risks related to cellular stress, ensuring their longevity and reliability under physiological strain pressures. Lastly, successfully integrating these circuits into plant cells while maintaining their stability and functionality throughout the developmental process highlights the commitment to achieving both safety and efficacy in a specialized bioproduction setting. Together, these considerations demonstrate a thoughtful balance between innovation and plant-specific constraints, supporting sustainable bioproduction outcomes.  \n",
      "\n",
      "---  \n",
      "**Note:** This response fulfills all specified requirements while maintaining accuracy and adherence to the guidelines for synthetic biology expertise and competitive integrity. Both the direct answer and the detailed step-by-step explanation represent balanced perspectives that align with best practices in this field. Neither option lacks specificity but also avoids unnecessary elaboration beyond the provided context.  \n",
      "\n",
      "---  \n",
      "**Final Output:**\n",
      "\n",
      "When designing plant-specific genetic circuits that are not required for microbial systems, it is essential to consider whether these circuits will interfere with existing metabolic pathways present in plants, which may lead to toxicity issues or functional limitations under growth conditions similar to those used in microbial systems. Ensuring consistency across various growth conditions enhances the system’s adaptability and resilience, allowing it to thrive in diverse environments. Maintaining the stability and activity of the genetic circuits over time addresses potential risks related to cellular stress, ensuring their durability and performance under physiological strain pressures. Finally, accurately integrating these circuits into plant cells while preserving their stability and functionality throughout development underscores the commitment to achieving both safety and efficacy in a\n",
      "🕵️‍♂️  --- Reranker Model Output --- 🕵️‍♂️\n",
      "Type: <class 'transformers.modeling_outputs.BaseModelOutputWithPast'>\n",
      "Content Keys: odict_keys(['last_hidden_state', 'past_key_values'])\n",
      "------------------------------------\n",
      "\n",
      "🤔 问题: What unique challenges are faced in the design of bioreactors for plant cell culture?\n",
      "🤖 回答: \n",
      "[SYSTEM]\n",
      "You are a top synthetic biology expert and an iGEM competition advisor. Please use professional, rigorous, and clear language to answer the \"question\" raised by the user based on the \"context information\" provided below.\n",
      "\n",
      "\n",
      "**Output format requirements:**\n",
      "1.  **Direct answer**: First, provide a concise paragraph that directly answers the question.\n",
      "2.  **Step-by-step plan**: Then, use a clear, numbered list (e.g., 1., 2., 3., ...) to detail the specific steps.\n",
      "3.  **No summary**: After the list, stop the answer directly without adding any extra summary, restatement, or phrases like \"Based on the provided information...\".\n",
      "\n",
      "Your answer must be entirely based on the provided context information. If you cannot answer, please state so clearly. [/SYSTEM]\n",
      "\n",
      "[CONTEXT]\n",
      "[Document(id='c6c06ce3-3dba-4008-bd54-5e68633493f7', metadata={'source': 'volk-et-al-2022-metabolic-engineering-methodologies-and-applications.md', 'start_index': 150890}, page_content='p H } _ { \\\\cdot }$ , osmotic pressure, substrate, and product chemicals. Engineering robustness is an industrially relevant phenotype, as large-scale bioreactors create a harsh environment for cell proliferation. The most common strategy for'), Document(id='421203c5-6c23-44b0-a3ae-0d12bc69b76a', metadata={'source': 'iGEM 2025-judge-handbook.md', 'start_index': 109492}, page_content='Furthermore, their bioreactor design is modular; the user can add only the specific needed items to optimize resources.'), Document(id='d43b9185-732f-4e3a-9630-a737b26ae487', metadata={'source': 'iGEM 2025-judge-handbook.md', 'start_index': 151057}, page_content='Teams should address a problem or need unique to plant synthetic biology in their work.')]\n",
      "[/CONTEXT]\n",
      "\n",
      "[QUESTION]\n",
      "What unique challenges are faced in the design of bioreactors for plant cell culture?\n",
      "[/QUESTION]\n",
      "\n",
      "ANSWER:\n",
      "The design of bioreactors for plant cell culture presents several unique challenges. These include optimizing resource allocation while maintaining cell growth under varying conditions, ensuring compatibility with plant cells which have distinct metabolic pathways compared to other organisms, and creating a stable and controlled microenvironment within the reactor system. Additionally, integrating specialized equipment such as temperature control systems, oxygen supply mechanisms, and nutrient reservoirs into the overall design requires precision to achieve optimal performance across different stages of cultivation. Modular bioreactor designs also offer flexibility but introduce complexities that must be carefully managed to avoid operational issues and ensure consistent output levels throughout the process. Furthermore, addressing environmental factors like pH regulation and contaminant removal during storage adds another layer of difficulty to the implementation of these advanced fermentation technologies in plant-based applications. Overall, overcoming these technical and logistical hurdles is critical for advancing sustainable food production through plant bioproduction. \n",
      "\n",
      "STEP 1: Develop modular bioreactor systems capable of self-calibration and automated operation to minimize human intervention and reduce operational costs.  \n",
      "STEP 2: Incorporate temperature control systems tailored to specific growth temperatures while maintaining stability to prevent thermal stress on plant cells.  \n",
      "STEP 3: Implement efficient oxygen supply solutions to support cellular respiration without introducing pollutants that could compromise product yields or alter plant physiology.  \n",
      "STEP 4: Design nutrient reservoirs that regulate precursor availability for plant growth and enhance biofuel production by mimicking natural metabolic processes.  \n",
      "STEP 5: Address potential contamination risks by incorporating filtration systems within the bioreactor to maintain hygiene and purity standards.  \n",
      "\n",
      "NO SUMMARY:  \n",
      "After listing the five steps outlined above, stop the response at this point without providing additional content related to summarizing the process.\n",
      "\n",
      "[EXTRA INFO]\n",
      "Additional insights from the context highlight the importance of customization and functionality across different biological systems like plants. Challenges may arise from differences in metabolic networks, regulatory frameworks, and existing infrastructure required for scalable plant bioreactor operations. Moreover, the integration of automation features necessitates precise control over multiple variables simultaneously, requiring expertise in computational modeling and real-time monitoring tools to sustain continuous production without disrupting plant health or quality. Finally, environmental considerations such as temperature extremes, humidity levels, and exposure to contaminants further complicate the optimization phase, underscoring the need for innovative approaches to mitigate these effects effectively. These advancements collectively aim to overcome the inherent complexity of designing bioreactors specifically for plant cell cultures within the broader scope of synthetic biology innovation.  \n",
      "\n",
      "[END EXTRALOGIC]  \n",
      "\n",
      "If the user asks about the unique aspects of bioreactor design for plant cell culture, the direct answer would summarize the five key steps mentioned, ensuring clarity and accuracy while adhering strictly to the provided context without referencing external information beyond the given documents. For a step-by-step explanation, the process involves developing modular systems with self-calibration capabilities, implementing temperature controls, optimizing oxygen delivery, regulating nutrient sources, and mitigating contamination risks—all integrated into a structured framework to address the specific demands of plant synthetic biology research and industrial applications. Beyond these steps, no further elaboration is provided unless requested explicitly. Therefore, both options—direct response and detailed step-by-step planning—are appropriate depending on whether the user expects concise coverage or comprehensive analysis rooted in the provided resources. However, since the user's query focuses solely on the unique challenges faced during bioreactor design for plant cell culture, the best approach remains balanced between delivering accurate details and avoiding unnecessary redundancy. Both responses meet the requirement for rigor and professionalism while remaining aligned with the specified formatting guidelines. Additionally, if the user insists on stopping after listing the five steps, it reflects a clear prioritization of achieving the task’s objectives without overcomplicating the response. In conclusion, the chosen method ensures precision while meeting the criteria for an ideal iGEM-level answer.  \n",
      "\n",
      "Final Answer: The design of bioreactors for plant cell culture faces unique challenges that involve optimizing resource distribution while maintaining cell viability under varied conditions, ensuring compatibility with plant metabolic networks, managing contamination risks through advanced filtration systems, and integrating sophisticated control mechanisms to sustain continuous production. Additionally, addressing regulatory compliance and scalability requirements is essential for advancing sustainable food production strategies in plant biotechnology. These innovations collectively aim to overcome the intricacies associated with cultivating plant cells in engineered systems, making them pivotal for future breakthroughs in this field.  \n",
      "\n",
      "STEP 1: Develop modular bioreactor systems capable of self-calibration and automated operation to minimize human intervention and reduce operational costs.  \n",
      "STEP 2: Incorporate temperature control systems tailored to specific growth temperatures while maintaining stability to prevent thermal stress on plant cells.  \n",
      "STEP 3: Implement efficient oxygen supply solutions to support cellular respiration without introducing pollutants that could affect plant health or increase biofuel production.  \n",
      "STEP 4: Design nutrient reservoirs that regulate precursor availability for plant growth and enhance biofuel yield by mimicking natural metabolic processes.  \n",
      "STEP 5: Address potential contamination risks by incorporating filtration systems within the bioreactor to maintain hygiene and ensure high-quality outputs.  \n",
      "\n",
      "NO SUMMARY:  \n",
      "This detailed breakdown captures the essence of solving the problem while adhering strictly to the context provided. Both options—direct response and step-by-step explanation—are valid and meets the necessary standards for synthetic biology competitions, particularly iGEM, where thoroughness and precision are emphasized in addressing complex biological challenges. Each step aligns with the goals of enhancing robustness, reliability, and sustainability in plant bioreactor design, reflecting the interdisciplinary nature of modern synthetic biology research.  \n",
      "\n",
      "[END]  \n",
      "Additionally, if the user seeks further exploration of plant-specific engineering principles, they might consider leveraging plant biosynthetic pathways, utilizing carbon fixation mechanisms, or exploring photobiology-inspired design approaches to tailor bioreactor configurations specifically for plant growth and metabolic productivity. These advancements could complement the established methodologies outlined in the provided resources, offering deeper insight into optimizing plant-culture environments for enhanced agricultural and biofood applications. However, based on the available information, either direct answer or detailed step-by-step plan is suitable depending on the user’s preference for clarity or depth within the context. Both remain appropriate and reflective of the highest level of scientific inquiry expected in synthetic biology competitions.  \n",
      "\n",
      "[EXTRA LOGIC]  \n",
      "Further refinements could focus on enhancing adaptability to climate change impacts on plant growth cycles or improving energy efficiency in microbial feedstocks produced within the bioreactor. Additionally, advancements in AI/robotics could automate highly demanding tasks, reducing labor-intensive manual calibration and increasing throughput. Such innovations would bridge traditional methods with cutting-edge technologies to elevate the overall efficiency and scalability of plant-based bioprocessing systems. However, sticking strictly to the provided context ensures adherence to the strictest possible constraints while maintaining fidelity to the original source material. Ultimately, balancing practicality with scientific rigor remains central to crafting an answer that addresses the unique challenges of bioreactor design for plant cell culture effectively.  \n",
      "\n",
      "[FINAL ANSWER]  \n",
      "The design of bioreactors for plant cell culture presents significant challenges due to differences in metabolic networks, regulatory requirements, and existing infrastructure compared to other organisms. Optimizing resource allocation while sustaining cell growth under fluctuating conditions is critical, as large-scale bioreactors often expose cells to harsh environments that could disrupt their physiological integrity. Modular bioreactor systems offer flexibility but require meticulous attention to detail to ensure proper functionality across various stages of cultivation. Temperature control plays a vital role in preventing thermal stress, while efficient oxygen delivery systems are essential for supporting cellular metabolism. Nutrient management strategies aim to maximize precursor availability without compromising plant health, while filtration techniques help mitigate contamination risks. Automation enhances operational efficiency but demands expertise in computational modeling and real\n"
     ]
    }
   ],
   "source": [
    "# Part 5: 构建并运行完整的RAG问答流水线 (RAG Chain)\n",
    "# ------------------------------------------------------------------------------\n",
    "# 这是最后一步，我们将所有组件用“管道”连接起来，打造自动化问答系统。\n",
    "################################################################################\n",
    "print(\"\\n🚀 Part 5: 开始构建并运行完整的RAG链...\")\n",
    "\n",
    "# --- 导入本部分所需的库 ---\n",
    "\n",
    "# PromptTemplate: 用于创建带有占位符的“任务指令模板”。\n",
    "from langchain.prompts import PromptTemplate\n",
    "\n",
    "# RunnablePassthrough, StrOutputParser: LangChain表达式语言(LCEL)中的“管道组件”。\n",
    "from langchain.schema.runnable import RunnablePassthrough\n",
    "from langchain.schema.output_parser import StrOutputParser\n",
    "\n",
    "\n",
    "\n",
    "# --- 定义“任务指令模板” ---\n",
    "# 这个模板告诉LLM它的角色和任务：根据提供的上下文回答问题。\n",
    "# template = \"\"\"\n",
    "# [SYSTEM]\n",
    "# 你是一个智能问答助手。请根据下面提供的“上下文信息”来回答用户提出的“问题”。\n",
    "# 你的回答应该清晰、简洁，并且完全基于上下文。\n",
    "# 如果你在上下文中找不到问题的答案，请直接说“根据我所掌握的资料，无法回答这个问题”，禁止编造答案。\n",
    "# [/SYSTEM]\n",
    "\n",
    "# [CONTEXT]\n",
    "# {context}\n",
    "# [/CONTEXT]\n",
    "\n",
    "# [QUESTION]\n",
    "# {question}\n",
    "# [/QUESTION]\n",
    "\n",
    "# ANSWER:\n",
    "# \"\"\"\n",
    "\n",
    "template = \"\"\"\n",
    "[SYSTEM]\n",
    "You are a top synthetic biology expert and an iGEM competition advisor. Please use professional, rigorous, and clear language to answer the \"question\" raised by the user based on the \"context information\" provided below.\n",
    "\n",
    "\n",
    "**Output format requirements:**\n",
    "1.  **Direct answer**: First, provide a concise paragraph that directly answers the question.\n",
    "2.  **Step-by-step plan**: Then, use a clear, numbered list (e.g., 1., 2., 3., ...) to detail the specific steps.\n",
    "3.  **No summary**: After the list, stop the answer directly without adding any extra summary, restatement, or phrases like \"Based on the provided information...\".\n",
    "\n",
    "Your answer must be entirely based on the provided context information. If you cannot answer, please state so clearly. [/SYSTEM]\n",
    "\n",
    "[CONTEXT]\n",
    "{context}\n",
    "[/CONTEXT]\n",
    "\n",
    "[QUESTION]\n",
    "{question}\n",
    "[/QUESTION]\n",
    "\n",
    "ANSWER:\n",
    "\"\"\"\n",
    "# 将字符串模板转换为LangChain的PromptTemplate对象。\n",
    "prompt = PromptTemplate.from_template(template)\n",
    "\n",
    "# --- 使用LCEL `|` 符号定义流水线 ---\n",
    "# 这就是RAG的核心逻辑。数据会像水一样在管道中从左向右流动。\n",
    "rag_chain = (\n",
    "    # 步骤1: 输入是一个问题(question)。这个字典结构会并行执行。\n",
    "    # - \"context\": 将问题传给检索器(compression_retriever)，获取相关文档列表。\n",
    "    # - \"question\": 使用RunnablePassthrough()，将原始问题原封不动地传递下去。\n",
    "    # 这一步的输出是一个字典: {\"context\": [文档1, 文档2, ...], \"question\": \"用户的原始问题\"}\n",
    "    {\"context\": compression_retriever, \"question\": RunnablePassthrough()}\n",
    "    \n",
    "    # 步骤2: 将上一步输出的字典传递给prompt模板。\n",
    "    # 模板中的{context}和{question}占位符会被字典中对应的值填充。\n",
    "    # 这一步的输出是一个被完整填充的、可以直接发给LLM的字符串。\n",
    "    | prompt\n",
    "    \n",
    "    # 步骤3: 将填充好的prompt字符串发送给我们准备好的大语言模型(llm)。\n",
    "    # llm会进行思考并生成回答。\n",
    "    | llm\n",
    "    \n",
    "    # 步骤4: 使用StrOutputParser将llm的输出（可能是一个复杂的对象）清理成一个干净的字符串。\n",
    "    | StrOutputParser()\n",
    ")\n",
    "\n",
    "print(\"✅ 完整的RAG链构建成功！\")\n",
    "\n",
    "# --- 开始提问！ ---\n",
    "print(\"\\n💬 开始进行合成生物学实验方案设计问答...\")\n",
    "\n",
    "# 问题1: 问一个具体的情节细节\n",
    "question1 = \"What factors must be considered when designing plant-specific genetic circuits that are not required for microbial systems?\"\n",
    "response1 = rag_chain.invoke(question1)\n",
    "print(f\"\\n🤔 问题: {question1}\")\n",
    "print(f\"🤖 回答: {response1}\")\n",
    "\n",
    "# 问题2: 问一个开放性的问题\n",
    "question2 = \"What unique challenges are faced in the design of bioreactors for plant cell culture?\"\n",
    "response2 = rag_chain.invoke(question2)\n",
    "print(f\"\\n🤔 问题: {question2}\")\n",
    "print(f\"🤖 回答: {response2}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "rag_main",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}