文章摘要 FakeGPT
加载中...|
概述
人类对话能够记住之前的内容,AI Agent 也需要记忆能力才能进行有效的多轮对话。本文将深入探讨 Agent 的记忆机制、上下文管理策略,以及如何构建有记忆的智能 Agent。
Agent 的记忆类型
记忆分类
text
┌─────────────────────────────────────────────────────────┐
│ Agent 记忆类型 │
├─────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────────────────────────────────────────┐ │
│ │ 短期记忆 (Short-term) │ │
│ │ • 当前对话上下文 │ │
│ │ • 存储在内存中 │ │
│ │ • 会话结束后丢失 │ │
│ │ • 容量有限(Token 限制) │ │
│ └─────────────────────────────────────────────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────────────────────────────────────────┐ │
│ │ 长期记忆 (Long-term) │ │
│ │ • 历史对话记录 │ │
│ │ • 持久化存储 │ │
│ │ • 跨会话保持 │ │
│ │ • 容量无限 │ │
│ └─────────────────────────────────────────────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────────────────────────────────────────┐ │
│ │ 工作记忆 (Working Memory) │ │
│ │ • 任务相关状态 │ │
│ │ • 中间计算结果 │ │
│ │ • 临时变量 │ │
│ │ • 任务完成后清除 │ │
│ └─────────────────────────────────────────────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────────────────────────────────────────┐ │
│ │ 语义记忆 (Semantic Memory) │ │
│ │ • 知识库 │ │
│ │ • 向量存储 │ │
│ │ • 语义搜索 │ │
│ └─────────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────┘
记忆层次对比
| 记忆类型 | 存储位置 | 持久化 | 容量 | 访问速度 | 使用场景 |
|---|---|---|---|---|---|
| 短期记忆 | 内存 | ❌ | 受 Token 限制 | 快 | 当前对话 |
| 长期记忆 | 数据库 | ✅ | 无限 | 中 | 历史记录 |
| 工作记忆 | 内存 | ❌ | 小 | 快 | 任务状态 |
| 语义记忆 | 向量库 | ✅ | 大 | 中 | 知识检索 |
上下文窗口与 Token 管理
上下文窗口限制
text
┌─────────────────────────────────────────────────────────┐
│ 上下文窗口管理 │
├─────────────────────────────────────────────────────────┤
│ │
│ 模型上下文窗口: │
│ • GPT-3.5-turbo: 16K tokens │
│ • GPT-4o: 128K tokens │
│ • GPT-4-turbo: 128K tokens │
│ • Claude 3.5: 200K tokens │
│ │
│ Token 组成: │
│ ┌──────────────────────────────────────────────────┐ │
│ │ System Prompt │ History │ User Msg │ Output │ │
│ │ 500 │ 5000 │ 500 │ 7000 │ │
│ └──────────────────────────────────────────────────┘ │
│ ↓ 总共 13000 tokens │
│ │
│ 当超过限制时: │
│ • 截断旧消息 │
│ • 总结压缩 │
│ • 使用向量检索 │
│ │
└─────────────────────────────────────────────────────────┘
Token 计算与监控
python
import tiktoken
def count_tokens(text: str, model: str = "gpt-4o") -> int:
    """Return the number of tokens in *text* for the given model.

    Falls back to the ``cl100k_base`` encoding when tiktoken does not
    recognise the model name (``encoding_for_model`` raises KeyError
    for models released after the installed tiktoken version).
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Unknown model name: use a reasonable default encoding.
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))
def count_messages_tokens(messages: list, model: str = "gpt-4o") -> int:
    """Return the total token count of a chat ``messages`` list.

    Uses the OpenAI cookbook accounting: a fixed per-message framing
    overhead, the encoded length of every field value, one extra token
    when a ``name`` field is present, and a fixed 3-token reply primer.
    Falls back to ``cl100k_base`` for model names tiktoken does not know.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Unknown model name: use a reasonable default encoding.
        encoding = tiktoken.get_encoding("cl100k_base")
    tokens_per_message = 3  # per-message framing overhead
    tokens_per_name = 1     # extra token when "name" is present
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with a fixed 3-token prefix
    return num_tokens
# Usage example: count the tokens of a short conversation.
messages = [
    {"role": "system", "content": "你是一个助手"},
    {"role": "user", "content": "你好"},
    {"role": "assistant", "content": "你好!有什么可以帮你的?"},
    {"role": "user", "content": "介绍一下你自己"}
]
total_tokens = count_messages_tokens(messages)
print(f"总 token 数: {total_tokens}")
滑动窗口策略
python
class SlidingWindowMemory:
    """Sliding-window conversation memory.

    Keeps every system message pinned and retains as many of the most
    recent non-system messages as fit inside ``max_tokens``.
    """

    def __init__(self, max_tokens: int = 4000, model: str = "gpt-4o"):
        self.max_tokens = max_tokens
        self.model = model
        self.messages = []

    def add_message(self, role: str, content: str):
        """Append a message and evict old ones if over budget."""
        self.messages.append({"role": role, "content": content})
        self._trim()

    def _trim(self):
        """Evict oldest non-system messages until within the token budget."""
        # System messages are always preserved.
        pinned = [m for m in self.messages if m["role"] == "system"]
        history = [m for m in self.messages if m["role"] != "system"]
        # Drop from the front (oldest first) until the total fits.
        while history and count_messages_tokens(pinned + history, self.model) > self.max_tokens:
            history.pop(0)
        self.messages = pinned + history

    def get_messages(self) -> list:
        """Return a shallow copy of the current message list."""
        return self.messages.copy()
# Usage
memory = SlidingWindowMemory(max_tokens=4000)
# Add conversation turns
memory.add_message("system", "你是一个Python专家")
memory.add_message("user", "什么是装饰器?")
memory.add_message("assistant", "装饰器是...")
# ... add more turns
# Fetch the (auto-trimmed) window
messages = memory.get_messages()
记忆压缩策略
1. 摘要压缩
python
from openai import OpenAI
import json
class SummaryMemory:
    """Conversation memory that compresses old turns into an LLM summary.

    When the transcript exceeds ``max_tokens``, the older half of the
    messages is replaced by a single system message holding a summary of
    at most ``summary_tokens`` tokens.
    """

    def __init__(self, max_tokens: int = 3000, summary_tokens: int = 500):
        self.max_tokens = max_tokens
        self.summary_tokens = summary_tokens
        self.messages = []
        self.summary = ""

    def add_message(self, role: str, content: str):
        """Append a message and compress the history if needed."""
        self.messages.append({"role": role, "content": content})
        self._compress_if_needed()

    def get_messages(self) -> list:
        """Return a shallow copy of the current message list.

        Bug fix: this accessor was missing even though the article's
        usage example calls ``memory.get_messages()``.
        """
        return self.messages.copy()

    def _compress_if_needed(self):
        """Trigger summarisation once the transcript exceeds the budget."""
        current_tokens = count_messages_tokens(self.messages)
        if current_tokens > self.max_tokens:
            self._create_summary()

    def _create_summary(self):
        """Summarise the older half of the conversation via the LLM."""
        # Nothing meaningful to compress with fewer than two messages.
        if len(self.messages) < 2:
            return
        client = OpenAI()
        # Split point: keep the most recent half verbatim.
        split_point = len(self.messages) // 2
        old_messages = self.messages[:split_point]
        recent_messages = self.messages[split_point:]
        # Ask the model for a bounded-length summary of the old half.
        prompt = f"""请用不超过 {self.summary_tokens} 个 token 总结以下对话:
{json.dumps(old_messages, ensure_ascii=False)}
只返回摘要内容,不要其他解释。"""
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}]
        )
        self.summary = response.choices[0].message.content
        # Replace the old half with one summary system message.
        self.messages = [
            {"role": "system", "content": f"之前的对话摘要:{self.summary}"}
        ] + recent_messages
# Usage
memory = SummaryMemory(max_tokens=4000)
# Add many turns of conversation
for i in range(20):
    memory.add_message("user", f"问题{i}")
    memory.add_message("assistant", f"回答{i}")
print(len(memory.get_messages()))  # 自动压缩
2. 分层存储
python
from enum import Enum
from datetime import datetime
class MessagePriority(Enum):
    """Priority tiers for hierarchical memory (lower value = higher priority)."""
    CRITICAL = 1  # system prompts, important instructions
    HIGH = 2      # most recent conversation turns
    MEDIUM = 3    # important historical information
    LOW = 4       # ordinary historical information
class HierarchicalMemory:
    """Priority-layered memory: messages are retrieved highest-priority first."""

    def __init__(self):
        # One message list per priority tier.
        self.layers = {
            MessagePriority.CRITICAL: [],
            MessagePriority.HIGH: [],
            MessagePriority.MEDIUM: [],
            MessagePriority.LOW: []
        }

    def add_message(self, role: str, content: str, priority: MessagePriority):
        """Store a message in the layer for *priority*."""
        self.layers[priority].append({
            "role": role,
            "content": content,
            "timestamp": datetime.now()
        })

    def get_messages(self, max_tokens: int) -> list:
        """Collect messages in descending priority order within a token budget.

        Bug fix: the original ``break`` only exited the current layer, so
        lower-priority messages could still be appended after a
        higher-priority message had been skipped. Filling now stops
        entirely once the budget is exhausted.
        """
        messages = []
        current_tokens = 0
        # Iterate tiers from CRITICAL down to LOW.
        for priority in MessagePriority:
            for msg in self.layers[priority]:
                msg_tokens = count_tokens(msg["content"])
                if current_tokens + msg_tokens > max_tokens:
                    # Budget exhausted: stop across ALL layers.
                    return messages
                messages.append(msg)
                current_tokens += msg_tokens
        return messages
# Usage
memory = HierarchicalMemory()
# Add messages with different priorities
memory.add_message(
    "system",
    "你是专业的Python助手",
    MessagePriority.CRITICAL
)
memory.add_message(
    "user",
    "最近我问过关于装饰器的问题",
    MessagePriority.MEDIUM
)
memory.add_message(
    "user",
    "现在我想了解生成器",
    MessagePriority.HIGH
)
messages = memory.get_messages(max_tokens=2000)
3. 关键信息提取
python
class KeyInfoMemory:
    """Memory that periodically extracts key facts from the conversation."""

    def __init__(self):
        self.conversation = []  # full transcript
        self.key_facts = []     # accumulated extracted facts

    def add_message(self, role: str, content: str):
        """Append a message; extract key facts every 6 messages."""
        self.conversation.append({"role": role, "content": content})
        # Extract key information every few turns.
        if len(self.conversation) % 6 == 0:
            self._extract_key_facts()

    def _extract_key_facts(self):
        """Ask the LLM for key facts as a JSON object with a "facts" key.

        Bug fix: the original prompt requested a bare JSON array while the
        parser (``facts.get("facts", [])``) and JSON mode
        (``response_format={"type": "json_object"}``) both require a JSON
        object; the prompt now matches what the code expects.
        """
        import json
        client = OpenAI()
        prompt = f"""从以下对话中提取关键事实信息,以JSON对象返回:
格式: {{"facts": [{{"fact": "事实描述", "category": "类别"}}]}}
对话:
{self.conversation}
只返回JSON,不要其他内容。"""
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        facts = json.loads(response.choices[0].message.content)
        self.key_facts.extend(facts.get("facts", []))

    def get_summary_context(self) -> str:
        """Render the 10 most recent key facts as a context string."""
        if not self.key_facts:
            return ""
        return "对话中的关键信息:\n" + "\n".join([
            f"- {fact['fact']}" for fact in self.key_facts[-10:]
        ])
向量存储作为长期记忆
记忆向量化存储
python
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
import json
class VectorMemory:
    """Long-term memory backed by a Chroma vector store."""

    def __init__(self, persist_directory="./memory_db"):
        self.embeddings = OpenAIEmbeddings()
        self.vectorstore = Chroma(
            persist_directory=persist_directory,
            embedding_function=self.embeddings
        )
        self.llm = ChatOpenAI(model="gpt-4o")

    def save_memory(self, content: str, metadata: dict = None):
        """Persist one piece of memory as a vector-store document."""
        self.vectorstore.add_documents([
            Document(page_content=content, metadata=metadata or {})
        ])

    def save_conversation_turn(self, user_msg: str, assistant_msg: str):
        """Persist one conversation turn as three retrievable documents.

        The user message, the assistant message and the paired turn are
        each stored separately so any of them can be recalled on its own.
        """
        self.save_memory(
            f"用户说: {user_msg}",
            {"type": "user", "timestamp": datetime.now().isoformat()}
        )
        self.save_memory(
            f"助手回答: {assistant_msg}",
            {"type": "assistant", "timestamp": datetime.now().isoformat()}
        )
        self.save_memory(
            f"对话: 用户问'{user_msg}',助手答'{assistant_msg}'",
            {"type": "conversation", "timestamp": datetime.now().isoformat()}
        )

    def recall(self, query: str, k: int = 3) -> list:
        """Return the contents of the *k* most similar memories."""
        hits = self.vectorstore.similarity_search(query, k=k)
        return [hit.page_content for hit in hits]

    def recall_with_scores(self, query: str, k: int = 3) -> list:
        """Return the *k* most similar memories with similarity scores."""
        scored = self.vectorstore.similarity_search_with_score(query, k=k)
        return [{"content": d.page_content, "score": s} for d, s in scored]
# Usage
memory = VectorMemory()
# Save conversation turns
memory.save_conversation_turn(
    "我叫小明,今年25岁,是一名程序员",
    "好的小明,很高兴认识你!"
)
memory.save_conversation_turn(
    "我喜欢用Python编程",
    "Python是门很棒的语言!"
)
# Recall related memories
query = "我叫什么名字?"
recalled = memory.recall(query, k=2)
print(f"相关记忆: {recalled}")
记忆检索策略
python
class MemoryRetriever:
    """Retrieval strategies layered on top of a VectorMemory."""

    def __init__(self, vector_memory: VectorMemory):
        self.memory = vector_memory
        self.llm = ChatOpenAI(model="gpt-4o")

    def recall_with_rewriting(self, query: str) -> list:
        """Recall after query rewriting (multi-query fusion).

        Bug fix: blank lines in the LLM's rewritten-query output used to
        be sent to the vector store as empty queries; they are now
        stripped and filtered out before retrieval.
        """
        # Ask the LLM for several query variants.
        prompt = f"""生成3个不同方式的查询来查找相关信息:
原查询: {query}
只返回查询,每行一个。"""
        response = self.llm.invoke(prompt)
        # Keep the original query plus non-empty rewritten variants.
        queries = [query] + [
            line.strip() for line in response.content.split('\n') if line.strip()
        ]
        # Merge results, counting how many variants retrieved each memory.
        all_results = {}
        for q in queries:
            for r in self.memory.recall(q, k=2):
                all_results[r] = all_results.get(r, 0) + 1
        # Rank by retrieval frequency and return the top 5.
        sorted_results = sorted(
            all_results.items(),
            key=lambda x: x[1],
            reverse=True
        )
        return [r[0] for r in sorted_results[:5]]

    def recall_with_filter(self, query: str, filter_type: str = None) -> list:
        """Recall with a substring filter on the content.

        Simplified implementation; a real one should use the vector
        store's metadata filtering instead.
        """
        all_results = self.memory.recall_with_scores(query, k=10)
        if filter_type:
            return [
                r for r in all_results
                if filter_type.lower() in r["content"].lower()
            ]
        return all_results

    def temporal_recall(self, query: str, recent_first: bool = True) -> list:
        """Time-aware recall.

        Simplified: returns similarity results as-is; a real
        implementation should sort by the timestamp in metadata.
        """
        return self.memory.recall_with_scores(query, k=10)
多轮对话状态管理
对话状态机
python
from enum import Enum
from typing import Optional, Dict, Any
class DialogueState(Enum):
    """Phases of a task-oriented dialogue."""
    GREETING = "greeting"
    INTENT_RECOGNITION = "intent_recognition"
    SLOT_FILLING = "slot_filling"
    ACTION_EXECUTION = "action_execution"
    CLOSING = "closing"

class DialogueStateManager:
    """Tracks the dialogue state, filled slots, and shared context."""

    def __init__(self):
        # Initialise by resetting to a pristine dialogue.
        self.reset()

    def transition(self, new_state: DialogueState, **context):
        """Move to *new_state*, recording the old state and merging context."""
        self.history.append(self.current_state)
        self.current_state = new_state
        self.context.update(context)

    def get_state(self) -> DialogueState:
        """Return the current dialogue state."""
        return self.current_state

    def set_slot(self, key: str, value: Any):
        """Store one slot value."""
        self.slots[key] = value

    def get_slot(self, key: str, default=None) -> Any:
        """Read one slot value, with an optional default."""
        return self.slots.get(key, default)

    def get_missing_slots(self, required_slots: list) -> list:
        """Return the required slots that have not been filled yet."""
        return [name for name in required_slots if name not in self.slots]

    def reset(self):
        """Start a fresh dialogue: state, slots, history and context."""
        self.current_state = DialogueState.GREETING
        self.slots = {}     # slot name -> value
        self.history = []   # past states, oldest first
        self.context = {}   # arbitrary shared context
# 使用示例
class BookingBot:
"""预订机器人"""
def __init__(self):
self.state_manager = DialogueStateManager()
self.required_slots = ["destination", "date", "people"]
def process(self, user_input: str) -> str:
"""处理用户输入"""
state = self.state_manager.get_state()
if state == DialogueState.GREETING:
self.state_manager.transition(DialogueState.INTENT_RECOGNITION)
return "你好!我可以帮你预订餐厅。请问你想去哪里用餐?"
elif state == DialogueState.INTENT_RECOGNITION:
# 简化:直接进入槽位填充
self.state_manager.transition(DialogueState.SLOT_FILLING)
# 尝试提取槽位
self._extract_slots(user_input)
missing = self.state_manager.get_missing_slots(self.required_slots)
if "destination" in missing:
return "请问你想预订哪家餐厅?"
elif "date" in missing:
return "请问你想预订什么时间?"
elif "people" in missing:
return "请问有几位用餐?"
elif state == DialogueState.SLOT_FILLING:
self._extract_slots(user_input)
missing = self.state_manager.get_missing_slots(self.required_slots)
if missing:
if "destination" in missing:
return "请问你想预订哪家餐厅?"
elif "date" in missing:
return "请问你想预订什么时间?"
elif "people" in missing:
return "请问有几位用餐?"
else:
self.state_manager.transition(DialogueState.ACTION_EXECUTION)
return self._execute_booking()
elif state == DialogueState.ACTION_EXECUTION:
self.state_manager.transition(DialogueState.CLOSING)
return f"预订成功!{self._get_booking_summary()}"
elif state == DialogueState.CLOSING:
self.state_manager.reset()
return "还有其他需要帮助的吗?"
def _extract_slots(self, text: str):
"""提取槽位(简化)"""
# 实际应用中应该使用 NER 或 LLM 提取
if "餐厅" in text or "海底捞" in text:
self.state_manager.set_slot("destination", text)
if "今天" in text or "明天" in text or any(c.isdigit() for c in text):
self.state_manager.set_slot("date", text)
if any(c.isdigit() for c in text):
import re
numbers = re.findall(r'\d+', text)
if numbers:
self.state_manager.set_slot("people", numbers[0])
def _execute_booking(self) -> str:
"""执行预订"""
return f"好的,已为你预订 {self.state_manager.get_slot('destination')}"
def _get_booking_summary(self) -> str:
"""获取预订摘要"""
return f"餐厅:{self.state_manager.get_slot('destination')}"LangChain 记忆组件
python
from langchain.memory import (
    ConversationBufferMemory,
    ConversationBufferWindowMemory,
    ConversationSummaryMemory,
    ConversationKGMemory,
    VectorStoreMemory,  # NOTE(review): langchain.memory documents VectorStoreRetrieverMemory; confirm this name exists in the installed version
    CombinedMemory      # NOTE(review): this import is shadowed by a custom CombinedMemory class defined later in the article
)
from langchain.chains import ConversationChain
from langchain_openai import ChatOpenAI
# 1. Buffer memory: keeps the full transcript verbatim
buffer_memory = ConversationBufferMemory(
    return_messages=True,
    human_prefix="用户",
    ai_prefix="助手"
)
# 2. Window memory: keeps only the most recent turns
window_memory = ConversationBufferWindowMemory(
    k=3,  # keep the last 3 turns
    return_messages=True
)
# 3. Summary memory: compresses history with an LLM
summary_memory = ConversationSummaryMemory(
    llm=ChatOpenAI(model="gpt-4o"),
    return_messages=True
)
# 4. Knowledge-graph memory
kg_memory = ConversationKGMemory(
    llm=ChatOpenAI(model="gpt-4o")
)
# 5. Vector-store memory
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
vectorstore = Chroma(embedding_function=OpenAIEmbeddings())
vector_memory = VectorStoreMemory(
    vectorstore=vectorstore,
    k=3  # retrieve top-3 relevant memories
)
# 6. Combined memory
class CombinedMemory:
    """Combines a short-term window memory with a long-term summary memory.

    NOTE(review): this class shadows the ``CombinedMemory`` imported from
    ``langchain.memory`` above, and it does not implement LangChain's
    ``BaseMemory`` interface (e.g. the ``memory_variables`` property) —
    confirm that ``ConversationChain`` accepts it before using.
    """
    def __init__(self):
        self.short_term = ConversationBufferWindowMemory(k=5)
        self.long_term = ConversationSummaryMemory(
            llm=ChatOpenAI(model="gpt-4o")
        )
    def load_memory_variables(self, inputs):
        """Merge variables from both memories under distinct keys."""
        short = self.short_term.load_memory_variables(inputs)
        long = self.long_term.load_memory_variables(inputs)
        return {
            "short_term_history": short.get("history", ""),
            "long_term_summary": long.get("history", "")
        }
    def save_context(self, inputs, outputs):
        """Record the turn in both underlying memories."""
        self.short_term.save_context(inputs, outputs)
        self.long_term.save_context(inputs, outputs)
# Usage
memory = CombinedMemory()
chain = ConversationChain(
    llm=ChatOpenAI(model="gpt-4o"),
    memory=memory,
    verbose=True
)
实战:构建有记忆的 Agent
python
from openai import OpenAI
from typing import List, Dict, Optional
import json
from datetime import datetime
class MemorableAgent:
    """Agent with short-term, long-term and working memory.

    Short-term memory is the in-process message list, long-term memory
    is a ``VectorMemory`` store, and working memory is a plain dict of
    task state. Depends on the module-level ``count_messages_tokens``
    and ``VectorMemory`` defined earlier in this file.
    """
    def __init__(self, name: str = "Assistant"):
        self.name = name
        self.client = OpenAI()
        self.short_term = []  # short-term memory: current conversation
        self.long_term = VectorMemory()  # long-term memory: vector store
        self.working_memory = {}  # working memory: task-related state
        # Initialize the system prompt
        self.system_prompt = f"""你是 {self.name},一个智能助手。
你有三种记忆:
1. 短期记忆:当前对话的上下文
2. 长期记忆:存储重要的历史信息
3. 工作记忆:当前任务的相关状态
使用这些记忆来提供连贯的对话体验。"""
    def chat(self, user_message: str) -> str:
        """Answer one user message, using and updating all memories."""
        # 1. Retrieve relevant long-term memories
        recalled = self.long_term.recall(user_message, k=3)
        memory_context = "\n".join([f"- {m}" for m in recalled]) if recalled else "无相关记忆"
        # 2. Build the message list (system prompt + recalled memories + history)
        messages = [
            {"role": "system", "content": self.system_prompt},
            {"role": "system", "content": f"相关记忆:\n{memory_context}"},
            *self.short_term,
            {"role": "user", "content": user_message}
        ]
        # 3. Call the LLM
        response = self.client.chat.completions.create(
            model="gpt-4o",
            messages=messages
        )
        assistant_message = response.choices[0].message.content
        # 4. Update short-term memory with this turn
        self.short_term.append({"role": "user", "content": user_message})
        self.short_term.append({"role": "assistant", "content": assistant_message})
        # Keep short-term memory within the token budget
        self._trim_short_term()
        # 5. Extract important information into long-term memory
        self._extract_and_save_facts(user_message, assistant_message)
        return assistant_message
    def _trim_short_term(self, max_tokens: int = 3000):
        """Drop the oldest turns until short-term memory fits the budget."""
        while count_messages_tokens(self.short_term) > max_tokens:
            # Remove the oldest user/assistant pair
            if len(self.short_term) >= 2:
                self.short_term.pop(0)
                self.short_term.pop(0)
            else:
                break
    def _extract_and_save_facts(self, user_msg: str, assistant_msg: str):
        """Periodically ask a small model for facts worth keeping long-term."""
        # Only extract every few turns (12 messages == 6 user/assistant pairs)
        if len(self.short_term) % 12 != 0:
            return
        prompt = f"""从以下对话中提取值得长期保存的重要信息(如用户偏好、重要事实等):
用户: {user_msg}
助手: {assistant_msg}
只返回重要事实,每行一条。如果没有重要信息,返回"无"。"""
        response = self.client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}]
        )
        facts = response.choices[0].message.content
        if facts and facts.lower() != "无":
            # Persist each non-empty fact line to long-term memory
            for fact in facts.split('\n'):
                fact = fact.strip()
                if fact:
                    self.long_term.save_memory(
                        fact,
                        {"type": "fact", "timestamp": datetime.now().isoformat()}
                    )
    def set_working_state(self, key: str, value: any):
        """Set a working-memory entry (note: `any` here is the builtin, not typing.Any)."""
        self.working_memory[key] = value
    def get_working_state(self, key: str) -> any:
        """Get a working-memory entry (None if absent)."""
        return self.working_memory.get(key)
    def clear_working_memory(self):
        """Clear all working memory."""
        self.working_memory = {}
    def reset_conversation(self):
        """Reset the conversation (long-term memory is kept)."""
        self.short_term = []
        self.working_memory = {}
    def save_session(self):
        """Flush short-term user/assistant pairs into long-term memory."""
        # Save each consecutive user/assistant pair to the vector store
        for i in range(0, len(self.short_term), 2):
            if i + 1 < len(self.short_term):
                user_msg = self.short_term[i]["content"]
                assistant_msg = self.short_term[i + 1]["content"]
                self.long_term.save_conversation_turn(user_msg, assistant_msg)
        self.short_term = []
# Usage example
agent = MemorableAgent("小助手")
# Conversation
print(agent.chat("我叫小明,今年25岁"))
# e.g. output: 你好小明,很高兴认识你!
print(agent.chat("我刚才说我多大了?"))
# e.g. output: 你说你今年25岁。
print(agent.chat("我喜欢用Python编程"))
# e.g. output: Python是一门很棒的编程语言...
# Persist the session into long-term memory
agent.save_session()
# Reset and test long-term recall
agent.reset_conversation()
print(agent.chat("我叫什么名字?"))
# If memories were saved correctly, the agent should recall "小明"
记忆是 Agent 能够进行有效多轮对话的关键:
核心要点
记忆类型
- 短期记忆:当前对话上下文
- 长期记忆:持久化历史信息
- 工作记忆:任务状态
- 语义记忆:知识库
Token 管理
- 监控 token 使用量
- 滑动窗口策略
- 自动修剪旧消息
记忆压缩
- 摘要压缩减少 token
- 分层存储按优先级管理
- 关键信息提取
向量存储
- 语义搜索相关记忆
- 查询重写优化召回
- 时间感知的记忆检索
状态管理
- 对话状态机
- 槽位填充
- LangChain 记忆组件
下一篇文章将介绍 LangChain 框架实战。
赞赏博主
评论 隐私政策