Agent Loses All State After Container Restart — Ephemeral Memory Problem
Symptom
- Container restarts (OOM kill, deploy, crash) and all in-progress tasks are lost
- User resumes conversation but agent has no memory of previous turns
- Agent loses user preferences on every restart — always starts from defaults
- Task at step 7 of 10 — container restarts — must restart from step 1
dict,list, and instance variable state is always empty on startup- Long-running agent job loses hours of progress to a single OOM kill
Root Cause
Python in-memory data structures (dicts, lists, class attributes) exist only in the process. When the process dies — container restart, OOM kill, deploy, timeout — all in-memory state is lost. Containers are designed to be ephemeral. Any state that matters must be written to persistent storage before the process exits.
Fix
Option 1: Redis for session state (fast, shared across replicas)
import redis
import json
import os
from typing import Any
class RedisStateStore:
"""
Persist agent state to Redis.
Survives container restarts, works across multiple replicas.
"""
def __init__(self, session_id: str, ttl_seconds: int = 86400):
self.r = redis.Redis(
host=os.environ.get("REDIS_HOST", "localhost"),
port=int(os.environ.get("REDIS_PORT", "6379")),
decode_responses=True
)
self.key = f"agent:session:{session_id}"
self.ttl = ttl_seconds
def get(self, field: str, default: Any = None) -> Any:
"""Get a single state field"""
value = self.r.hget(self.key, field)
if value is None:
return default
try:
return json.loads(value)
except json.JSONDecodeError:
return value
def set(self, field: str, value: Any):
"""Set a single state field"""
self.r.hset(self.key, field, json.dumps(value))
self.r.expire(self.key, self.ttl)
def get_all(self) -> dict:
"""Get entire session state"""
raw = self.r.hgetall(self.key)
return {k: json.loads(v) for k, v in raw.items()}
def update(self, updates: dict):
"""Update multiple fields atomically"""
pipe = self.r.pipeline()
for k, v in updates.items():
pipe.hset(self.key, k, json.dumps(v))
pipe.expire(self.key, self.ttl)
pipe.execute()
def delete(self):
self.r.delete(self.key)
# Usage:
session_id = "user_123_session_456"
state = RedisStateStore(session_id)
# Save progress that survives restarts:
state.set("current_step", 7)
state.set("task_goal", "Analyze Q4 sales data")
state.set("completed_steps", ["fetch_data", "clean_data", "aggregate"])
# On restart — state is still there:
step = state.get("current_step", default=1)
print(f"Resuming from step {step}") # → 7, not 1
Option 2: SQLite for local persistent state
import sqlite3
import json
from pathlib import Path
from datetime import datetime
class SQLiteStateStore:
"""
SQLite-based state persistence — works without Redis.
Mount the SQLite file on a persistent volume in Docker.
"""
def __init__(self, db_path: str = "/data/agent_state.db"):
# Ensure parent directory exists (mounted volume)
Path(db_path).parent.mkdir(parents=True, exist_ok=True)
self.conn = sqlite3.connect(db_path, check_same_thread=False)
self.conn.execute("PRAGMA journal_mode=WAL") # Better concurrent access
self._init_schema()
def _init_schema(self):
self.conn.execute("""
CREATE TABLE IF NOT EXISTS session_state (
session_id TEXT NOT NULL,
key TEXT NOT NULL,
value TEXT NOT NULL,
updated_at TEXT DEFAULT (datetime('now')),
PRIMARY KEY (session_id, key)
)
""")
self.conn.commit()
def set(self, session_id: str, key: str, value):
self.conn.execute("""
INSERT OR REPLACE INTO session_state (session_id, key, value, updated_at)
VALUES (?, ?, ?, ?)
""", (session_id, key, json.dumps(value), datetime.utcnow().isoformat()))
self.conn.commit()
def get(self, session_id: str, key: str, default=None):
row = self.conn.execute(
"SELECT value FROM session_state WHERE session_id=? AND key=?",
(session_id, key)
).fetchone()
return json.loads(row[0]) if row else default
def get_all(self, session_id: str) -> dict:
rows = self.conn.execute(
"SELECT key, value FROM session_state WHERE session_id=?",
(session_id,)
).fetchall()
return {k: json.loads(v) for k, v in rows}
def delete_session(self, session_id: str):
self.conn.execute("DELETE FROM session_state WHERE session_id=?", (session_id,))
self.conn.commit()
# Docker volume mount ensures SQLite file persists across container restarts:
# docker run -v /host/data:/data my-agent
store = SQLiteStateStore("/data/agent_state.db")
Option 3: Checkpoint pattern — save state before any risky operation
import json
from pathlib import Path
from datetime import datetime
class TaskCheckpointer:
"""
Save task progress at each step.
On restart, load checkpoint and resume from last known state.
"""
def __init__(self, task_id: str, checkpoint_dir: str = "/data/checkpoints"):
self.task_id = task_id
self.dir = Path(checkpoint_dir)
self.dir.mkdir(parents=True, exist_ok=True)
self.path = self.dir / f"{task_id}.json"
self._state = self._load()
def _load(self) -> dict:
if self.path.exists():
state = json.loads(self.path.read_text())
print(
f"Checkpoint loaded for {self.task_id}: "
f"step {state.get('current_step')}/{state.get('total_steps')}"
)
return state
return {}
def save(self, **kwargs):
"""Save current state — call after each significant step"""
self._state.update(kwargs)
self._state["saved_at"] = datetime.utcnow().isoformat()
# Atomic write — write to temp then rename
tmp = self.path.with_suffix(".tmp")
tmp.write_text(json.dumps(self._state, indent=2))
tmp.replace(self.path)
def get(self, key: str, default=None):
return self._state.get(key, default)
def is_step_done(self, step_name: str) -> bool:
return step_name in self._state.get("completed_steps", [])
def mark_step_done(self, step_name: str):
done = self._state.get("completed_steps", [])
if step_name not in done:
done.append(step_name)
self.save(completed_steps=done)
def clear(self):
self.path.unlink(missing_ok=True)
self._state = {}
cp = TaskCheckpointer("analysis_task_abc123")
async def run_analysis(data_sources: list):
total = len(data_sources)
start_idx = cp.get("current_index", 0) # Resume from checkpoint
if start_idx > 0:
print(f"Resuming from index {start_idx}/{total}")
for i in range(start_idx, total):
source = data_sources[i]
await process_source(source)
# Save after each item — restart resumes from next item
cp.save(current_index=i + 1, last_processed=source["id"])
cp.clear() # Clean up on success
print("Analysis complete")
Option 4: Environment variable for stateless configuration
import os
from pydantic import BaseSettings
class AgentConfig(BaseSettings):
"""
Configuration that survives restarts via environment variables.
State that should persist: use persistent storage (Redis/SQLite).
Configuration that should persist: use environment variables.
"""
# These survive restarts via env vars:
model_name: str = "claude-sonnet-4-6"
max_tokens: int = 4096
agent_persona: str = "helpful assistant"
output_dir: str = "/data/output"
# These come from secrets manager:
api_key: str
database_url: str
class Config:
env_file = ".env"
env_prefix = "AGENT_"
# Docker-compose passes env vars that survive restarts:
# services:
# agent:
# environment:
# - AGENT_MODEL_NAME=claude-sonnet-4-6
# - AGENT_OUTPUT_DIR=/data/output
config = AgentConfig()
Option 5: Graceful shutdown — flush state before exit
import signal
import asyncio
import atexit
class GracefulAgent:
"""
Agent that saves state on shutdown signals before container stops.
"""
def __init__(self, session_id: str):
self.session_id = session_id
self.state = {}
self._running = True
# Register shutdown handlers
signal.signal(signal.SIGTERM, self._on_shutdown)
signal.signal(signal.SIGINT, self._on_shutdown)
atexit.register(self._flush_state)
def _on_shutdown(self, signum, frame):
"""Called when container receives SIGTERM (graceful shutdown)"""
print(f"Shutdown signal received ({signum}). Saving state...")
self._flush_state()
self._running = False
def _flush_state(self):
"""Write current state to persistent storage"""
if self.state:
store = SQLiteStateStore()
for key, value in self.state.items():
store.set(self.session_id, key, value)
print(f"State flushed: {len(self.state)} keys saved")
def update_state(self, **kwargs):
self.state.update(kwargs)
# Optional: write-through for maximum safety
self._flush_state()
# Docker STOPSIGNAL is SIGTERM by default (15 seconds before SIGKILL)
# Agent has 15 seconds to save state on graceful shutdown
Option 6: Docker persistent volume configuration
# docker-compose.yml — persist state across container restarts
services:
agent:
image: my-agent:latest
environment:
- REDIS_HOST=redis
- STATE_DB_PATH=/data/agent_state.db
volumes:
- agent-data:/data # Persists SQLite, checkpoints, output
- agent-logs:/app/logs # Persists log files
depends_on:
- redis
redis:
image: redis:7-alpine
volumes:
- redis-data:/data # Persists Redis data (--appendonly yes)
command: redis-server --appendonly yes # Write-ahead log for durability
volumes:
agent-data:
driver: local
redis-data:
driver: local
agent-logs:
driver: local
# State in /data/ and redis-data survive: restarts, redeploys, OOM kills
# State in container memory: lost on any process exit
State Persistence Options
| Storage | Survives restart | Multi-replica | Latency | Best for |
|---|---|---|---|---|
| Python dict/list | No | No | 0ms | Never rely on it |
| SQLite + volume | Yes | Single-writer | ~1ms | Single-instance state |
| Redis | Yes | Yes | ~1ms | Multi-replica sessions |
| PostgreSQL | Yes | Yes | ~5ms | Structured state + history |
| File + volume | Yes | No | ~5ms | Checkpoints, large blobs |
| Env vars | Yes (config only) | Yes | 0ms | Configuration, not dynamic state |
Expected Token Savings
User repeats context after restart × 10 restarts: ~50,000 tokens Persistent state: user never needs to repeat context
Environment
- Any agent deployed in containers (Docker/k8s) with session state or long-running tasks
- Source: direct experience; ephemeral state is the most common production surprise when moving from local dev to containerized deployment
Wasting tokens on this error?
Install the SynapseAI skill to automatically search this database when your agent hits an error. Average savings: $2–5 per error incident.
clawhub install synapse-ai
Solved an error that's not here?
Share it and earn MoltCoin rewards.