Files
aiworker/docs/05-agents/ciclo-vida.md
Hector Ros db71705842 Complete documentation for future sessions
- CLAUDE.md for AI agents to understand the codebase
- GITEA-GUIDE.md centralizes all Gitea operations (API, Registry, Auth)
- DEVELOPMENT-WORKFLOW.md explains complete dev process
- ROADMAP.md, NEXT-SESSION.md for planning
- QUICK-REFERENCE.md, TROUBLESHOOTING.md for daily use
- 40+ detailed docs in /docs folder
- Backend as submodule from Gitea

Everything documented for autonomous operation.

Co-Authored-By: Claude Sonnet 4.5 (1M context) <noreply@anthropic.com>
2026-01-20 00:37:19 +01:00

9.8 KiB

Ciclo de Vida de los Agentes

Estados del Agente

┌──────────────┐
│ Initializing │
└──────┬───────┘
       │
       ▼
   ┌──────┐     ┌──────┐
   │ Idle │◄───►│ Busy │
   └───┬──┘     └──┬───┘
       │           │
       │           │
       ▼           ▼
   ┌───────┐   ┌───────┐
   │ Error │   │Offline│
   └───────┘   └───────┘

Inicialización

1. Creación del Pod

// The backend creates the pod, passing the capability list for the new agent
const agentManager = new AgentManager()
const agent = await agentManager.createAgent(['javascript', 'react'])

// Result (example values)
{
  id: 'agent-abc123',
  podName: 'claude-agent-abc123',
  namespace: 'agents',
  status: 'initializing'
}

2. Arranque del Contenedor

# Inside the pod (entrypoint.sh)
echo "🤖 Starting agent: $AGENT_ID"

# 1. Setup SSH
# Create the directory first: redirecting into a missing directory fails.
mkdir -p /root/.ssh
chmod 700 /root/.ssh
# printf writes the key verbatim; echo may interpret backslashes or a leading
# "-n" depending on the shell.
printf '%s\n' "$GIT_SSH_KEY" > /root/.ssh/id_ed25519
chmod 600 /root/.ssh/id_ed25519

# 2. Configure Claude Code MCP
# Same reason as above: make sure the config directory exists before writing.
mkdir -p /root/.claude-code
cat > /root/.claude-code/config.json <<EOF
{
  "mcpServers": {
    "aiworker": {
      "url": "$MCP_SERVER_URL"
    }
  }
}
EOF

# 3. Send initial heartbeat
# The X-Agent-ID header identifies this agent; the body reports it as idle.
curl -X POST "$MCP_SERVER_URL/heartbeat" \
  -H "Content-Type: application/json" \
  -H "X-Agent-ID: $AGENT_ID" \
  -d '{"status":"idle"}'

# 4. Start work loop
# exec replaces this shell with the loop script, so nothing after this line runs.
exec /usr/local/bin/agent-loop.sh

3. Registro en el Sistema

// The backend sees the heartbeat and records the agent as idle
const heartbeatReceivedAt = new Date()

await db
  .update(agents)
  .set({ status: 'idle', lastHeartbeat: heartbeatReceivedAt })
  .where(eq(agents.id, agentId))

logger.info(`Agent ${agentId} is now active`)

Asignación de Tarea

1. Agent Polling

# agent-loop.sh
# Polls the MCP server for work forever; sleeps 10 seconds whenever no task
# id comes back.
while :; do
  echo "📋 Checking for tasks..."

  # Ask the backend for the next task assigned to this agent.
  TASK=$(curl -s -X POST "$MCP_SERVER_URL/tools/call" \
    -H "Content-Type: application/json" \
    -d "{
      \"name\": \"get_next_task\",
      \"arguments\": {\"agentId\": \"$AGENT_ID\"}
    }")

  # The tool result carries a JSON document as text; extract task.id from it.
  # "// empty" yields an empty string when there is no task.
  TASK_ID=$(echo "$TASK" | jq -r '.content[0].text | fromjson | .task.id // empty')

  # Nothing to do: back off before polling again.
  if [ -z "$TASK_ID" ]; then
    sleep 10
    continue
  fi

  echo "🎯 Got task: $TASK_ID"
  process_task "$TASK_ID"
done

2. Backend Asigna Tarea

// services/mcp/handlers.ts - getNextTask()
/**
 * Hands the next backlog task to the requesting agent.
 *
 * Picks the backlog task with the highest priority (oldest first among equal
 * priorities), marks it `in_progress` for the agent, and marks the agent
 * `busy` on that task.
 *
 * @param args.agentId - id of the agent asking for work
 * @returns a single text content item whose text is JSON: `{ message: 'No tasks' }`
 *          when the backlog is empty, otherwise `{ task }` with the row as it
 *          was read before the update.
 */
async function getNextTask(args: { agentId: string }) {
  // Wraps a payload in the text-content envelope used for every response here.
  const asTextContent = (payload: unknown) => ({
    content: [{ type: 'text', text: JSON.stringify(payload) }],
  })

  // 1. Look up the next task in the backlog
  const nextTask = await db.query.tasks.findFirst({
    where: eq(tasks.state, 'backlog'),
    orderBy: [desc(tasks.priority), asc(tasks.createdAt)],
  })

  if (!nextTask) {
    return asTextContent({ message: 'No tasks' })
  }

  // 2. Assign it to the agent
  // NOTE(review): the lookup above and this update are separate statements, so
  // two agents polling at the same time could both read the same backlog task
  // before either update lands — confirm whether something serializes callers.
  await db
    .update(tasks)
    .set({
      state: 'in_progress',
      assignedAgentId: args.agentId,
      assignedAt: new Date(),
      startedAt: new Date(),
    })
    .where(eq(tasks.id, nextTask.id))

  // 3. Update the agent
  await db
    .update(agents)
    .set({ status: 'busy', currentTaskId: nextTask.id })
    .where(eq(agents.id, args.agentId))

  // 4. Return the task
  return asTextContent({ task: nextTask })
}

Trabajo en Tarea

Fase 1: Setup

# Clone repo
git clone "$PROJECT_REPO" "/workspace/task-$TASK_ID"
cd "/workspace/task-$TASK_ID"

# Create branch (via MCP)
# The body is JSON, so declare it as such; without the header curl sends
# application/x-www-form-urlencoded for -d payloads.
curl -X POST "$MCP_SERVER_URL/tools/call" \
  -H "Content-Type: application/json" \
  -d "{\"name\": \"create_branch\", \"arguments\": {\"taskId\": \"$TASK_ID\"}}"

# Checkout branch
# NOTE(review): BRANCH_NAME is not assigned in this snippet; presumably it comes
# from the create_branch response or the task payload — confirm and set it
# before this point.
git fetch origin
git checkout "$BRANCH_NAME"

Fase 2: Implementación

# Build the prompt once and keep it in TASK_PROMPT, the variable that the
# timeout-guarded invocation of claude-code passes as its message.
TASK_PROMPT="
I need you to work on this task:

Title: $TASK_TITLE
Description: $TASK_DESC

Instructions:
1. Analyze the codebase
2. Implement the changes
3. Write tests
4. Commit with clear messages
5. Use MCP tools when done

Start working now.
"

# Start Claude Code session
claude-code chat --message "$TASK_PROMPT"

Fase 3: Preguntas (opcional)

// When the agent needs more information, it posts a question for the user
await mcp.callTool('ask_user_question', {
  taskId,
  question: 'Should I add TypeScript types?',
  context: 'The codebase is in JavaScript...',
})

// Move the task to needs_input while waiting
await mcp.callTool('update_task_status', { taskId, status: 'needs_input' })

// Poll every 5 seconds until an answer is available
let response
do {
  await sleep(5000)
  const check = await mcp.callTool('check_question_response', { taskId })
  if (check.hasResponse) response = check.response
} while (!response)

// Resume work now that the answer is in
await mcp.callTool('update_task_status', { taskId, status: 'in_progress' })

Fase 4: Finalización

# Sends one tool call to the MCP server.
#   $1 - complete JSON request body
call_mcp_tool() {
  curl -X POST "$MCP_SERVER_URL/tools/call" \
    -H "Content-Type: application/json" \
    -d "$1"
}

# Create PR
# jq builds the payload so that quotes, backslashes or newlines in the task
# title are escaped instead of producing invalid JSON.
call_mcp_tool "$(jq -cn \
  --arg taskId "$TASK_ID" \
  --arg title "$TASK_TITLE" \
  '{
    name: "create_pull_request",
    arguments: {
      taskId: $taskId,
      title: $title,
      description: "Implemented feature X..."
    }
  }')"

# Deploy preview
call_mcp_tool "$(jq -cn \
  --arg taskId "$TASK_ID" \
  '{name: "trigger_preview_deploy", arguments: {taskId: $taskId}}')"

# Update status
call_mcp_tool "$(jq -cn \
  --arg taskId "$TASK_ID" \
  '{
    name: "update_task_status",
    arguments: {taskId: $taskId, status: "ready_to_test"}
  }')"

Liberación del Agente

// Once the task is finished (ready_to_test or completed), free the agent.
// The counter is incremented inside the database via a raw SQL expression.
const completedIncrement = sql`tasks_completed + 1`

await db
  .update(agents)
  .set({ status: 'idle', currentTaskId: null, tasksCompleted: completedIncrement })
  .where(eq(agents.id, agentId))

logger.info(`Agent ${agentId} completed task ${taskId}, now idle`)

Manejo de Errores

Timeout de Tarea

# agent-loop.sh with timeout
# Runs the session under a 7200-second limit; the braces run only when the
# command exits non-zero.
timeout 7200 claude-code chat --message "$TASK_PROMPT" || {
  STATUS=$?
  # 124 is the status this script treats as "the time limit was hit".
  if [ "$STATUS" -eq 124 ]; then
    echo "⏰ Task timeout after 2 hours"

    # Notify backend
    curl -X POST "$MCP_SERVER_URL/tools/call" \
      -H "Content-Type: application/json" \
      -d "{
        \"name\": \"update_task_status\",
        \"arguments\": {
          \"taskId\": \"$TASK_ID\",
          \"status\": \"needs_input\",
          \"metadata\": {\"reason\": \"timeout\"}
        }
      }"

    # Log error
    curl -X POST "$MCP_SERVER_URL/tools/call" \
      -H "Content-Type: application/json" \
      -d "{
        \"name\": \"log_activity\",
        \"arguments\": {
          \"agentId\": \"$AGENT_ID\",
          \"level\": \"error\",
          \"message\": \"Task timeout: $TASK_ID\"
        }
      }"
  else
    # Any other failure used to pass silently; at least leave a trace of it.
    echo "❌ Agent session exited with status $STATUS" >&2
  fi
}

Crash del Agente

// The backend detects agents that stopped sending heartbeats
/**
 * Recycles every agent whose last heartbeat is more than five minutes old.
 *
 * For each stale agent: its current task (if any) goes back to the backlog
 * unassigned, its pod is deleted, its row is removed, and a replacement agent
 * is created. A failure while recycling one agent is logged and does not stop
 * the remaining agents from being processed.
 *
 * Rejects only if the initial query for stale agents fails.
 */
async function checkStaleAgents() {
  const staleThreshold = new Date(Date.now() - 5 * 60 * 1000) // 5 min

  const staleAgents = await db.query.agents.findMany({
    where: lt(agents.lastHeartbeat, staleThreshold),
  })

  for (const agent of staleAgents) {
    logger.warn(`Agent ${agent.id} is stale`)

    try {
      // Requeue the task the agent was working on so another agent can take it
      if (agent.currentTaskId) {
        await db.update(tasks)
          .set({
            state: 'backlog',
            assignedAgentId: null,
          })
          .where(eq(tasks.id, agent.currentTaskId))
      }

      // Delete agent pod
      await k8sClient.deletePod(agent.k8sNamespace, agent.podName)

      // Remove from DB
      await db.delete(agents).where(eq(agents.id, agent.id))

      // Create replacement
      await agentManager.createAgent()
    } catch (error: unknown) {
      // Keep going: one agent that cannot be recycled must not block the rest.
      const reason = error instanceof Error ? error.message : String(error)
      logger.error(`Failed to recycle stale agent ${agent.id}: ${reason}`)
    }
  }
}

// Run every minute. setInterval ignores the returned promise, so the rejection
// is handled here instead of surfacing as an unhandled rejection.
setInterval(() => {
  checkStaleAgents().catch((error: unknown) => {
    const reason = error instanceof Error ? error.message : String(error)
    logger.error(`Stale agent check failed: ${reason}`)
  })
}, 60000)

Terminación Ordenada (Graceful Shutdown)

# agent-entrypoint.sh
# Signal handler: reports the agent as offline, stops the background job
# tracked in HEARTBEAT_PID, and exits with status 0.
cleanup() {
  echo "🛑 Shutting down agent..."

  # Send offline status
  # Same headers as the startup heartbeat so the backend can tell which agent
  # is going offline. --max-time bounds the call so shutdown cannot hang on an
  # unreachable server; failures are deliberately ignored (best effort).
  curl -X POST "$MCP_SERVER_URL/heartbeat" \
    --max-time 5 \
    -H "Content-Type: application/json" \
    -H "X-Agent-ID: $AGENT_ID" \
    -d '{"status":"offline"}' 2>/dev/null || true

  # Kill background jobs
  # Skip the kill when no PID was recorded instead of invoking kill without
  # arguments.
  if [ -n "$HEARTBEAT_PID" ]; then
    kill "$HEARTBEAT_PID" 2>/dev/null || true
  fi

  echo "👋 Goodbye"
  exit 0
}

# Run cleanup when the process is asked to terminate or is interrupted
trap cleanup SIGTERM SIGINT

# Wait for signals
wait

Auto-Scaling

// Auto-scaler that runs every 30s
/**
 * Adjusts the number of agents to the current backlog.
 *
 * Only idle and busy agents count toward the current total.
 * - Scale up by 2 (capped at 10) when more than 3 tasks are pending and no
 *   agent is idle.
 * - Scale down by 1 (never below 2) when nothing is pending and more than 2
 *   agents are idle.
 *
 * Calls `agentManager.scaleAgents` only when the target differs from the
 * current total.
 */
async function autoScale() {
  // Get metrics. The three queries are independent, so run them together.
  const [pendingTasks, idleAgents, busyAgents] = await Promise.all([
    db.query.tasks.findMany({ where: eq(tasks.state, 'backlog') }),
    db.query.agents.findMany({ where: eq(agents.status, 'idle') }),
    db.query.agents.findMany({ where: eq(agents.status, 'busy') }),
  ])

  const totalAgents = idleAgents.length + busyAgents.length

  // Decision logic
  let targetAgents = totalAgents

  // Scale up if:
  // - More than 3 pending tasks
  // - No idle agents
  if (pendingTasks.length > 3 && idleAgents.length === 0) {
    targetAgents = Math.min(totalAgents + 2, 10) // Max 10
  }

  // Scale down if:
  // - No pending tasks
  // - More than 2 idle agents
  if (pendingTasks.length === 0 && idleAgents.length > 2) {
    targetAgents = Math.max(totalAgents - 1, 2) // Min 2
  }

  if (targetAgents !== totalAgents) {
    // Separate the two counts; printed back to back they read as one number.
    logger.info(`Auto-scaling: ${totalAgents} → ${targetAgents}`)
    await agentManager.scaleAgents(targetAgents)
  }
}

// setInterval ignores the returned promise, so the rejection is handled here
// instead of surfacing as an unhandled rejection.
setInterval(() => {
  autoScale().catch((error: unknown) => {
    const reason = error instanceof Error ? error.message : String(error)
    logger.error(`Auto-scaling failed: ${reason}`)
  })
}, 30000)

Métricas del Ciclo de Vida

// Agent metrics endpoint: responds with counts per status plus aggregate
// task and runtime totals across all agents.
router.get('/agents/metrics', async (req, res) => {
  const agents = await db.query.agents.findMany()

  // Number of agents currently in the given status.
  const countWithStatus = (status: string) =>
    agents.filter((agent) => agent.status === status).length

  const completedTotal = agents.reduce((sum, agent) => sum + agent.tasksCompleted, 0)
  const runtimeTotal = agents.reduce((sum, agent) => sum + agent.totalRuntimeMinutes, 0)

  res.json({
    total: agents.length,
    byStatus: {
      idle: countWithStatus('idle'),
      busy: countWithStatus('busy'),
      error: countWithStatus('error'),
      offline: countWithStatus('offline'),
    },
    totalTasksCompleted: completedTotal,
    // With zero agents the division is 0 / 0 (NaN); "|| 0" turns that into 0.
    avgTasksPerAgent: completedTotal / agents.length || 0,
    totalRuntime: runtimeTotal,
  })
})

Dashboard Visualization

En el frontend, mostrar:

  • Estado actual de cada agente (initializing/idle/busy/error/offline)
  • Tarea actual si está busy
  • Historial de tareas completadas
  • Métricas (tareas/hora, uptime, etc.)
  • Botones para restart/delete agente
  • Logs en tiempo real de cada agente