Complete documentation for future sessions
- CLAUDE.md for AI agents to understand the codebase - GITEA-GUIDE.md centralizes all Gitea operations (API, Registry, Auth) - DEVELOPMENT-WORKFLOW.md explains complete dev process - ROADMAP.md, NEXT-SESSION.md for planning - QUICK-REFERENCE.md, TROUBLESHOOTING.md for daily use - 40+ detailed docs in /docs folder - Backend as submodule from Gitea Everything documented for autonomous operation. Co-Authored-By: Claude Sonnet 4.5 (1M context) <noreply@anthropic.com>
This commit is contained in:
452
docs/05-agents/ciclo-vida.md
Normal file
452
docs/05-agents/ciclo-vida.md
Normal file
@@ -0,0 +1,452 @@
|
||||
# Ciclo de Vida de los Agentes
|
||||
|
||||
## Estados del Agente
|
||||
|
||||
```
|
||||
┌──────────────┐
|
||||
│ Initializing │
|
||||
└──────┬───────┘
|
||||
│
|
||||
▼
|
||||
┌──────┐ ┌──────┐
|
||||
│ Idle │◄───►│ Busy │
|
||||
└───┬──┘ └──┬───┘
|
||||
│ │
|
||||
│ │
|
||||
▼ ▼
|
||||
┌───────┐ ┌───────┐
|
||||
│ Error │ │Offline│
|
||||
└───────┘ └───────┘
|
||||
```
|
||||
|
||||
## Inicialización
|
||||
|
||||
### 1. Creación del Pod
|
||||
|
||||
```typescript
|
||||
// Backend crea el pod
|
||||
const agentManager = new AgentManager()
|
||||
const agent = await agentManager.createAgent(['javascript', 'react'])
|
||||
|
||||
// Resultado
|
||||
{
|
||||
id: 'agent-abc123',
|
||||
podName: 'claude-agent-abc123',
|
||||
namespace: 'agents',
|
||||
status: 'initializing'
|
||||
}
|
||||
```
|
||||
|
||||
### 2. Arranque del Contenedor
|
||||
|
||||
```bash
|
||||
# Inside the pod (entrypoint.sh): installs the Git SSH key, writes the Claude
# Code MCP configuration, announces the agent to the backend and then hands
# control to the work loop.
echo "🤖 Starting agent: $AGENT_ID"

# 1. Setup SSH
# Create the directory explicitly and write the key under a restrictive umask,
# so the private key is never readable by other users, not even for the moment
# between the write and the chmod.
mkdir -p /root/.ssh
chmod 700 /root/.ssh
(umask 077 && printf '%s\n' "$GIT_SSH_KEY" > /root/.ssh/id_ed25519)
chmod 600 /root/.ssh/id_ed25519

# 2. Configure Claude Code MCP
# The config directory is created first so the redirect below cannot fail on a
# missing path.
mkdir -p /root/.claude-code
cat > /root/.claude-code/config.json <<EOF
{
  "mcpServers": {
    "aiworker": {
      "url": "$MCP_SERVER_URL"
    }
  }
}
EOF

# 3. Send initial heartbeat
curl -X POST "$MCP_SERVER_URL/heartbeat" \
  -H "Content-Type: application/json" \
  -H "X-Agent-ID: $AGENT_ID" \
  -d '{"status":"idle"}'

# 4. Start work loop (exec replaces this shell with the loop process)
exec /usr/local/bin/agent-loop.sh
|
||||
```
|
||||
|
||||
### 3. Registro en el Sistema
|
||||
|
||||
```typescript
|
||||
// Backend detecta el heartbeat y actualiza
|
||||
await db.update(agents)
|
||||
.set({
|
||||
status: 'idle',
|
||||
lastHeartbeat: new Date(),
|
||||
})
|
||||
.where(eq(agents.id, agentId))
|
||||
|
||||
logger.info(`Agent ${agentId} is now active`)
|
||||
```
|
||||
|
||||
## Asignación de Tarea
|
||||
|
||||
### 1. Agent Polling
|
||||
|
||||
```bash
|
||||
# agent-loop.sh
|
||||
while true; do
|
||||
echo "📋 Checking for tasks..."
|
||||
|
||||
TASK=$(curl -s -X POST "$MCP_SERVER_URL/tools/call" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{
|
||||
\"name\": \"get_next_task\",
|
||||
\"arguments\": {\"agentId\": \"$AGENT_ID\"}
|
||||
}")
|
||||
|
||||
TASK_ID=$(echo "$TASK" | jq -r '.content[0].text | fromjson | .task.id // empty')
|
||||
|
||||
if [ -n "$TASK_ID" ]; then
|
||||
echo "🎯 Got task: $TASK_ID"
|
||||
process_task "$TASK_ID"
|
||||
else
|
||||
sleep 10
|
||||
fi
|
||||
done
|
||||
```
|
||||
|
||||
### 2. Backend Asigna Tarea
|
||||
|
||||
```typescript
|
||||
// services/mcp/handlers.ts - getNextTask()
|
||||
/**
 * Hands the next backlog task to the requesting agent.
 *
 * Selects the backlog task with the highest priority (oldest first on ties),
 * marks it `in_progress` for `args.agentId`, flags that agent as `busy`, and
 * returns the task inside an MCP text-content payload. When no task can be
 * claimed, the payload carries `{ message: 'No tasks' }` instead.
 *
 * @param args.agentId - ID of the agent asking for work.
 * @returns MCP tool result whose single text item is a JSON string.
 */
async function getNextTask(args: { agentId: string }) {
  const noTasks = {
    content: [{ type: 'text', text: JSON.stringify({ message: 'No tasks' }) }],
  }

  // 1. Find the next task in the backlog
  const candidate = await db.query.tasks.findFirst({
    where: eq(tasks.state, 'backlog'),
    orderBy: [desc(tasks.priority), asc(tasks.createdAt)],
  })

  if (!candidate) {
    return noTasks
  }

  // 2. Claim it for this agent.
  // The extra `state = 'backlog'` condition makes the update a no-op when
  // another agent claimed the same row between the read above and this write,
  // so two agents polling at the same time cannot both take the task.
  // (`and` comes from the same ORM operator set as `eq`.)
  const now = new Date()
  await db
    .update(tasks)
    .set({
      state: 'in_progress',
      assignedAgentId: args.agentId,
      assignedAt: now,
      startedAt: now,
    })
    .where(and(eq(tasks.id, candidate.id), eq(tasks.state, 'backlog')))

  // Re-read the row to find out who won the claim. This avoids depending on a
  // driver-specific "affected rows" result.
  const task = await db.query.tasks.findFirst({
    where: eq(tasks.id, candidate.id),
  })

  if (!task || task.assignedAgentId !== args.agentId) {
    // Lost the race: report "no tasks" so the caller polls again later.
    return noTasks
  }

  // 3. Update the agent
  await db
    .update(agents)
    .set({
      status: 'busy',
      currentTaskId: task.id,
    })
    .where(eq(agents.id, args.agentId))

  // 4. Return the task as stored after the claim (state already `in_progress`)
  return {
    content: [
      {
        type: 'text',
        text: JSON.stringify({ task }),
      },
    ],
  }
}
|
||||
```
|
||||
|
||||
## Trabajo en Tarea
|
||||
|
||||
### Fase 1: Setup
|
||||
|
||||
```bash
|
||||
# Clone repo
git clone "$PROJECT_REPO" "/workspace/task-$TASK_ID"
cd "/workspace/task-$TASK_ID"

# Create branch (via MCP)
# The Content-Type header is needed because curl -d otherwise labels the body
# as form data; the task-polling call to this endpoint sends the same header.
curl -X POST "$MCP_SERVER_URL/tools/call" \
  -H "Content-Type: application/json" \
  -d "{\"name\": \"create_branch\", \"arguments\": {\"taskId\": \"$TASK_ID\"}}"

# Checkout branch
# NOTE(review): BRANCH_NAME is not assigned anywhere in this snippet.
# Presumably it comes from the create_branch response or from the task
# payload; confirm and capture it before this step.
git fetch origin
git checkout "$BRANCH_NAME"
|
||||
```
|
||||
|
||||
### Fase 2: Implementación
|
||||
|
||||
```bash
|
||||
# Start Claude Code session
|
||||
claude-code chat --message "
|
||||
I need you to work on this task:
|
||||
|
||||
Title: $TASK_TITLE
|
||||
Description: $TASK_DESC
|
||||
|
||||
Instructions:
|
||||
1. Analyze the codebase
|
||||
2. Implement the changes
|
||||
3. Write tests
|
||||
4. Commit with clear messages
|
||||
5. Use MCP tools when done
|
||||
|
||||
Start working now.
|
||||
"
|
||||
```
|
||||
|
||||
### Fase 3: Preguntas (opcional)
|
||||
|
||||
```typescript
|
||||
// Si el agente necesita info
|
||||
await mcp.callTool('ask_user_question', {
|
||||
taskId,
|
||||
question: 'Should I add TypeScript types?',
|
||||
context: 'The codebase is in JavaScript...',
|
||||
})
|
||||
|
||||
// Cambiar estado a needs_input
|
||||
await mcp.callTool('update_task_status', {
|
||||
taskId,
|
||||
status: 'needs_input',
|
||||
})
|
||||
|
||||
// Hacer polling cada 5s hasta respuesta
|
||||
let response
|
||||
while (!response) {
|
||||
await sleep(5000)
|
||||
const check = await mcp.callTool('check_question_response', { taskId })
|
||||
if (check.hasResponse) {
|
||||
response = check.response
|
||||
}
|
||||
}
|
||||
|
||||
// Continuar con la respuesta
|
||||
await mcp.callTool('update_task_status', {
|
||||
taskId,
|
||||
status: 'in_progress',
|
||||
})
|
||||
```
|
||||
|
||||
### Fase 4: Finalización
|
||||
|
||||
```bash
|
||||
# Create PR
# The payload is built with jq so that quotes, backslashes or newlines in the
# task title are escaped correctly instead of producing invalid JSON.
PR_PAYLOAD=$(jq -n \
  --arg taskId "$TASK_ID" \
  --arg title "$TASK_TITLE" \
  '{
    name: "create_pull_request",
    arguments: {
      taskId: $taskId,
      title: $title,
      description: "Implemented feature X..."
    }
  }')

# Content-Type is set explicitly on every call: curl -d otherwise labels the
# body as form data rather than JSON.
curl -X POST "$MCP_SERVER_URL/tools/call" \
  -H "Content-Type: application/json" \
  -d "$PR_PAYLOAD"

# Deploy preview
curl -X POST "$MCP_SERVER_URL/tools/call" \
  -H "Content-Type: application/json" \
  -d "{
    \"name\": \"trigger_preview_deploy\",
    \"arguments\": {\"taskId\": \"$TASK_ID\"}
  }"

# Update status
curl -X POST "$MCP_SERVER_URL/tools/call" \
  -H "Content-Type: application/json" \
  -d "{
    \"name\": \"update_task_status\",
    \"arguments\": {
      \"taskId\": \"$TASK_ID\",
      \"status\": \"ready_to_test\"
    }
  }"
|
||||
```
|
||||
|
||||
## Liberación del Agente
|
||||
|
||||
```typescript
|
||||
// Cuando tarea completa (ready_to_test o completed)
|
||||
await db.update(agents)
|
||||
.set({
|
||||
status: 'idle',
|
||||
currentTaskId: null,
|
||||
tasksCompleted: sql`tasks_completed + 1`,
|
||||
})
|
||||
.where(eq(agents.id, agentId))
|
||||
|
||||
logger.info(`Agent ${agentId} completed task ${taskId}, now idle`)
|
||||
```
|
||||
|
||||
## Manejo de Errores
|
||||
|
||||
### Timeout de Tarea
|
||||
|
||||
```bash
|
||||
# agent-loop.sh with timeout
# Runs the Claude Code session under a 7200-second (2-hour) limit. `timeout`
# exits with status 124 when the limit is reached; only that case is reported
# to the backend here, other non-zero exits fall through untouched.
timeout 7200 claude-code chat --message "$TASK_PROMPT" || {
  STATUS=$?
  if [ $STATUS -eq 124 ]; then
    echo "⏰ Task timeout after 2 hours"

    # Notify backend: park the task in needs_input with the reason attached.
    # Content-Type is set explicitly because curl -d otherwise labels the body
    # as form data rather than JSON.
    curl -X POST "$MCP_SERVER_URL/tools/call" \
      -H "Content-Type: application/json" \
      -d "{
        \"name\": \"update_task_status\",
        \"arguments\": {
          \"taskId\": \"$TASK_ID\",
          \"status\": \"needs_input\",
          \"metadata\": {\"reason\": \"timeout\"}
        }
      }"

    # Log error
    curl -X POST "$MCP_SERVER_URL/tools/call" \
      -H "Content-Type: application/json" \
      -d "{
        \"name\": \"log_activity\",
        \"arguments\": {
          \"agentId\": \"$AGENT_ID\",
          \"level\": \"error\",
          \"message\": \"Task timeout: $TASK_ID\"
        }
      }"
  fi
}
|
||||
```
|
||||
|
||||
### Crash del Agente
|
||||
|
||||
```typescript
|
||||
/**
 * Backend sweep that recycles agents whose heartbeat has gone silent.
 *
 * Every agent whose `lastHeartbeat` is older than five minutes has its current
 * task (if any) returned to the backlog, its pod deleted, its DB row removed,
 * and a replacement agent created.
 *
 * A failure while recycling one agent is logged and does not stop the sweep,
 * so a single broken agent cannot block the clean-up of the others or reject
 * the promise started by the periodic timer.
 */
async function checkStaleAgents() {
  const staleThreshold = new Date(Date.now() - 5 * 60 * 1000) // 5 min

  // NOTE(review): agents that never sent a heartbeat presumably have no
  // lastHeartbeat value and would not match this comparison. Confirm whether
  // pods stuck in "initializing" need a separate check.
  const staleAgents = await db.query.agents.findMany({
    where: lt(agents.lastHeartbeat, staleThreshold),
  })

  for (const agent of staleAgents) {
    logger.warn(`Agent ${agent.id} is stale`)

    try {
      // Return the in-flight task to the backlog so another agent can take it
      if (agent.currentTaskId) {
        await db
          .update(tasks)
          .set({
            state: 'backlog',
            assignedAgentId: null,
          })
          .where(eq(tasks.id, agent.currentTaskId))
      }

      // Delete agent pod
      await k8sClient.deletePod(agent.k8sNamespace, agent.podName)

      // Remove from DB
      await db.delete(agents).where(eq(agents.id, agent.id))

      // Create replacement
      await agentManager.createAgent()
    } catch (err) {
      // Keep going: the agent is still stale and will be retried on the next
      // sweep.
      const reason = err instanceof Error ? err.message : String(err)
      logger.warn(`Failed to recycle stale agent ${agent.id}: ${reason}`)
    }
  }
}
|
||||
|
||||
// Run every minute
|
||||
setInterval(checkStaleAgents, 60000)
|
||||
```
|
||||
|
||||
## Terminación Ordenada (Graceful Shutdown)
|
||||
|
||||
```bash
|
||||
# agent-entrypoint.sh
|
||||
# Signal handler for shutdown: reports the agent as offline, stops the
# background heartbeat job and exits with status 0.
cleanup() {
  echo "🛑 Shutting down agent..."

  # Send offline status.
  # Uses the same headers as the startup heartbeat: X-Agent-ID says which agent
  # is going offline, and Content-Type marks the body as JSON (curl -d would
  # otherwise label it as form data). --max-time keeps a slow or unreachable
  # backend from stalling the shutdown; failures are deliberately ignored.
  curl --max-time 5 -X POST "$MCP_SERVER_URL/heartbeat" \
    -H "Content-Type: application/json" \
    -H "X-Agent-ID: $AGENT_ID" \
    -d '{"status":"offline"}' 2>/dev/null || true

  # Kill background jobs (only when a heartbeat PID was actually recorded)
  if [ -n "${HEARTBEAT_PID:-}" ]; then
    kill "$HEARTBEAT_PID" 2>/dev/null || true
  fi

  echo "👋 Goodbye"
  exit 0
}
|
||||
|
||||
trap cleanup SIGTERM SIGINT
|
||||
|
||||
# Wait for signals
|
||||
wait
|
||||
```
|
||||
|
||||
## Auto-Scaling
|
||||
|
||||
```typescript
|
||||
/**
 * Auto-scaler, intended to run every 30 seconds.
 *
 * Compares the backlog size with the number of idle and busy agents and asks
 * the agent manager for a new fleet size when needed:
 * - scale up by 2 (capped at 10) when more than 3 tasks are pending and no
 *   agent is idle;
 * - scale down by 1 (never below 2) when nothing is pending and more than 2
 *   agents are idle.
 */
async function autoScale() {
  const MAX_AGENTS = 10
  const MIN_AGENTS = 2

  // Get metrics. The three queries are independent, so they run concurrently.
  const [pendingTasks, idleAgents, busyAgents] = await Promise.all([
    db.query.tasks.findMany({
      where: eq(tasks.state, 'backlog'),
    }),
    db.query.agents.findMany({
      where: eq(agents.status, 'idle'),
    }),
    db.query.agents.findMany({
      where: eq(agents.status, 'busy'),
    }),
  ])

  // Only idle and busy agents count towards the fleet size here.
  const totalAgents = idleAgents.length + busyAgents.length

  // Decision logic
  let targetAgents = totalAgents

  if (pendingTasks.length > 3 && idleAgents.length === 0) {
    // Scale up: more than 3 pending tasks and no idle agents.
    // The outer Math.max keeps the cap from lowering the target: if the fleet
    // is already above MAX_AGENTS, a plain min() would shrink a fully busy
    // fleet while work is still queued.
    targetAgents = Math.max(totalAgents, Math.min(totalAgents + 2, MAX_AGENTS))
  } else if (pendingTasks.length === 0 && idleAgents.length > 2) {
    // Scale down: no pending tasks and more than 2 idle agents.
    targetAgents = Math.max(totalAgents - 1, MIN_AGENTS)
  }

  if (targetAgents !== totalAgents) {
    logger.info(`Auto-scaling: ${totalAgents} → ${targetAgents}`)
    await agentManager.scaleAgents(targetAgents)
  }
}
|
||||
|
||||
setInterval(autoScale, 30000)
|
||||
```
|
||||
|
||||
## Métricas del Ciclo de Vida
|
||||
|
||||
```typescript
|
||||
// Agent metrics endpoint: responds with the fleet size, a per-status
// breakdown, and totals for completed tasks and runtime across all agents.
router.get('/agents/metrics', async (req, res) => {
  const allAgents = await db.query.agents.findMany()

  // Number of agents currently reporting the given status.
  const countWithStatus = (status: string) =>
    allAgents.filter((agent) => agent.status === status).length

  let completedSum = 0
  let runtimeSum = 0
  for (const agent of allAgents) {
    completedSum = completedSum + agent.tasksCompleted
    runtimeSum = runtimeSum + agent.totalRuntimeMinutes
  }

  res.json({
    total: allAgents.length,
    byStatus: {
      idle: countWithStatus('idle'),
      busy: countWithStatus('busy'),
      error: countWithStatus('error'),
      offline: countWithStatus('offline'),
    },
    totalTasksCompleted: completedSum,
    // With no agents the division yields NaN, which `|| 0` turns into 0.
    avgTasksPerAgent: completedSum / allAgents.length || 0,
    totalRuntime: runtimeSum,
  })
})
|
||||
```
|
||||
|
||||
## Visualización en el Dashboard
|
||||
|
||||
En el frontend, mostrar:
|
||||
- **Estado actual** de cada agente (initializing/idle/busy/error/offline)
|
||||
- **Tarea actual** si está busy
|
||||
- **Historial** de tareas completadas
|
||||
- **Métricas** (tareas/hora, uptime, etc.)
|
||||
- **Botones** para restart/delete agente
|
||||
- **Logs en tiempo real** de cada agente
|
||||
Reference in New Issue
Block a user