AI 应用监控与告警体系
AI 应用的监控比传统应用更复杂。需要监控什么指标?如何设置告警?本文详解 AI 应用监控与告警体系的设计与实践。
一、监控指标体系
1.1 监控层次
AI 应用监控层次:
┌─────────────────────────────────────┐
│ 1. 基础设施层 │
│ - CPU/内存/磁盘 │
│ - 网络 IO │
│ - GPU 资源 │
├─────────────────────────────────────┤
│ 2. 应用服务层 │
│ - 服务可用性 │
│ - 请求延迟 │
│ - 错误率 │
├─────────────────────────────────────┤
│ 3. AI 质量层 │
│ - 检索质量 │
│ - 生成质量 │
│ - 用户满意度 │
├─────────────────────────────────────┤
│ 4. 业务指标层 │
│ - 用户活跃度 │
│ - 任务完成率 │
│ - 业务转化率 │
└─────────────────────────────────────┘
1.2 核心指标
# monitoring_metrics.py
from typing import Dict, List


class AIMonitoringMetrics:
    """Catalogue of AI-application monitoring metrics.

    Metrics are grouped into four layers (system, application, AI quality,
    business).  Each entry records a human-readable description, the unit,
    and the warning/critical alert thresholds; ``None`` means the metric is
    collected for observation only and never triggers an alert.
    """

    # Layer 1: infrastructure / system metrics
    SYSTEM_METRICS = {
        'cpu_usage': {
            'description': 'CPU 使用率',
            'unit': '%',
            'warning_threshold': 70,
            'critical_threshold': 90,
        },
        'memory_usage': {
            'description': '内存使用率',
            'unit': '%',
            'warning_threshold': 70,
            'critical_threshold': 85,
        },
        'disk_usage': {
            'description': '磁盘使用率',
            'unit': '%',
            'warning_threshold': 80,
            'critical_threshold': 90,
        },
        'gpu_usage': {
            'description': 'GPU 使用率',
            'unit': '%',
            'warning_threshold': 80,
            'critical_threshold': 95,
        },
    }

    # Layer 2: application / service metrics
    APPLICATION_METRICS = {
        'request_rate': {
            'description': '请求速率',
            'unit': 'req/s',
            'warning_threshold': None,
            'critical_threshold': None,
        },
        'response_latency_p50': {
            'description': 'P50 响应延迟',
            'unit': 'ms',
            'warning_threshold': 1000,
            'critical_threshold': 3000,
        },
        'response_latency_p99': {
            'description': 'P99 响应延迟',
            'unit': 'ms',
            'warning_threshold': 3000,
            'critical_threshold': 5000,
        },
        'error_rate': {
            'description': '错误率',
            'unit': '%',
            'warning_threshold': 1,
            'critical_threshold': 5,
        },
        'availability': {
            'description': '可用性',
            'unit': '%',
            # NOTE(review): availability presumably alerts when the value
            # drops BELOW these thresholds, unlike the usage metrics above.
            'warning_threshold': 99.9,
            'critical_threshold': 99,
        },
    }

    # Layer 3: AI quality metrics
    AI_QUALITY_METRICS = {
        'retrieval_precision': {
            'description': '检索精确率',
            'unit': '%',
            'warning_threshold': 70,
            'critical_threshold': 50,
        },
        'retrieval_recall': {
            'description': '检索召回率',
            'unit': '%',
            'warning_threshold': 70,
            'critical_threshold': 50,
        },
        'generation_faithfulness': {
            'description': '生成忠实度',
            'unit': '%',
            'warning_threshold': 70,
            'critical_threshold': 50,
        },
        'hallucination_rate': {
            'description': '幻觉率',
            'unit': '%',
            'warning_threshold': 10,
            'critical_threshold': 20,
        },
        'user_satisfaction': {
            'description': '用户满意度',
            'unit': 'score (1-5)',
            'warning_threshold': 3.5,
            'critical_threshold': 3.0,
        },
    }

    # Layer 4: business metrics
    BUSINESS_METRICS = {
        'active_users': {
            'description': '活跃用户数',
            'unit': 'count',
            'warning_threshold': None,
            'critical_threshold': None,
        },
        'task_completion_rate': {
            'description': '任务完成率',
            'unit': '%',
            'warning_threshold': 80,
            'critical_threshold': 60,
        },
        'cost_per_query': {
            'description': '单次查询成本',
            'unit': 'USD',
            'warning_threshold': 0.05,
            'critical_threshold': 0.10,
        },
    }
二、监控系统设计
2.1 监控架构
# monitoring_architecture.py
from typing import Dict, List


class MonitoringArchitecture:
    """Describes the five layers of the AI monitoring stack.

    The layer descriptions are plain data; they are assembled once in the
    constructor and exposed via ``self.components``.
    """

    def __init__(self):
        self.components = {
            'data_collection': self._data_collection_layer(),
            'data_storage': self._data_storage_layer(),
            'data_processing': self._data_processing_layer(),
            'alerting': self._alerting_layer(),
            'visualization': self._visualization_layer(),
        }

    def _data_collection_layer(self) -> Dict:
        """Collection layer: agents and SDKs that emit raw telemetry."""
        return {
            'tools': [
                'Prometheus Node Exporter',
                'Application Metrics SDK',
                'Log Collectors (Fluentd/Fluent Bit)',
                'Distributed Tracing (Jaeger/Zipkin)',
            ],
            'metrics_types': [
                'system_metrics',
                'application_metrics',
                'business_metrics',
                'ai_quality_metrics',
            ],
        }

    def _data_storage_layer(self) -> Dict:
        """Storage layer: where metrics, logs, traces and alerts live."""
        return {
            'time_series_db': 'Prometheus/VictoriaMetrics',
            'log_storage': 'Elasticsearch',
            'trace_storage': 'Jaeger/Elasticsearch',
            'alert_storage': 'Alertmanager',
        }

    def _data_processing_layer(self) -> Dict:
        """Processing layer: aggregation, stream and batch pipelines."""
        return {
            'aggregation': 'Prometheus Recording Rules',
            'stream_processing': 'Apache Flink/Kafka Streams',
            'batch_processing': 'Apache Spark',
        }

    def _alerting_layer(self) -> Dict:
        """Alerting layer: routing, notification channels, escalation."""
        return {
            'alert_manager': 'Prometheus Alertmanager',
            'notification_channels': [
                'Email',
                'Slack',
                'PagerDuty',
                '钉钉',
                '企业微信',
            ],
            'escalation_policy': 'based_on_severity',
        }

    def _visualization_layer(self) -> Dict:
        """Visualization layer: dashboards and reporting."""
        return {
            'dashboards': 'Grafana',
            'custom_reports': 'Kibana',
            'real_time_monitoring': 'Grafana Live',
        }
2.2 指标采集
# metrics_collection.py
from typing import Dict, List
from prometheus_client import Counter, Histogram, Gauge, start_http_server
import time
class AIMetricsCollector:
    """Prometheus metrics collector for the AI application.

    Registers gauges, counters and histograms for the system, application,
    AI-quality and business layers, then exposes them over HTTP via
    ``start_http_server`` so Prometheus can scrape ``/metrics``.
    """

    def __init__(self, port: int = 8000):
        # --- system metrics ---
        self.cpu_usage = Gauge('ai_cpu_usage_percent', 'CPU 使用率')
        self.memory_usage = Gauge('ai_memory_usage_percent', '内存使用率')

        # --- application metrics ---
        self.request_count = Counter('ai_requests_total', '总请求数', ['endpoint', 'method'])
        self.request_latency = Histogram('ai_request_latency_seconds', '请求延迟', ['endpoint'])
        self.error_count = Counter('ai_errors_total', '错误数', ['endpoint', 'error_type'])

        # --- AI quality metrics ---
        self.retrieval_precision = Gauge('ai_retrieval_precision', '检索精确率')
        self.retrieval_recall = Gauge('ai_retrieval_recall', '检索召回率')
        self.generation_faithfulness = Gauge('ai_generation_faithfulness', '生成忠实度')
        self.hallucination_rate = Gauge('ai_hallucination_rate', '幻觉率')

        # --- business metrics ---
        self.active_users = Gauge('ai_active_users', '活跃用户数')
        self.task_completion_rate = Gauge('ai_task_completion_rate', '任务完成率')
        self.cost_per_query = Gauge('ai_cost_per_query', '单次查询成本')

        # Start the HTTP endpoint Prometheus scrapes.
        start_http_server(port)

    def record_request(self, endpoint: str, method: str, latency: float):
        """Record one handled request and its latency (in seconds)."""
        self.request_count.labels(endpoint=endpoint, method=method).inc()
        self.request_latency.labels(endpoint=endpoint).observe(latency)

    def record_error(self, endpoint: str, error_type: str):
        """Record one error, labelled by endpoint and error type."""
        self.error_count.labels(endpoint=endpoint, error_type=error_type).inc()

    def record_ai_quality(
        self,
        precision: float,
        recall: float,
        faithfulness: float,
        hallucination_rate: float,
    ):
        """Publish the latest AI-quality measurements."""
        self.retrieval_precision.set(precision)
        self.retrieval_recall.set(recall)
        self.generation_faithfulness.set(faithfulness)
        self.hallucination_rate.set(hallucination_rate)

    def record_business_metrics(
        self,
        active_users: int,
        task_completion_rate: float,
        cost_per_query: float,
    ):
        """Publish the latest business-level measurements."""
        self.active_users.set(active_users)
        self.task_completion_rate.set(task_completion_rate)
        self.cost_per_query.set(cost_per_query)
# --- usage example ---
collector = AIMetricsCollector()

# Record a request together with its wall-clock latency.
start_time = time.time()
# ... handle the request ...
latency = time.time() - start_time
collector.record_request('/api/query', 'POST', latency)

# Publish AI-quality measurements.
collector.record_ai_quality(
    precision=0.85,
    recall=0.78,
    faithfulness=0.82,
    hallucination_rate=0.05,
)
三、告警策略
3.1 告警规则
# alerting_rules.yml
# Prometheus alerting rules for the AI application, grouped by layer.
groups:
  - name: AI Application Alerts
    rules:
      # --- system alerts ---
      - alert: HighCPUUsage
        expr: ai_cpu_usage_percent > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "CPU 使用率过高"
          description: "CPU 使用率 {{ $value }}% 超过 80%"
      - alert: HighMemoryUsage
        expr: ai_memory_usage_percent > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "内存使用率过高"
          description: "内存使用率 {{ $value }}% 超过 85%"
      # --- application alerts ---
      - alert: HighErrorRate
        expr: sum(rate(ai_errors_total[5m])) / sum(rate(ai_requests_total[5m])) > 0.05
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "错误率过高"
          description: "错误率 {{ $value }} 超过 5%"
      - alert: HighLatency
        expr: histogram_quantile(0.99, sum(rate(ai_request_latency_seconds_bucket[5m])) by (le)) > 3
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "P99 延迟过高"
          description: "P99 延迟 {{ $value }}s 超过 3s"
      # --- AI quality alerts ---
      - alert: LowRetrievalPrecision
        expr: ai_retrieval_precision < 0.5
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "检索精确率过低"
          description: "检索精确率 {{ $value }} 低于 50%"
      - alert: HighHallucinationRate
        expr: ai_hallucination_rate > 0.2
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "幻觉率过高"
          description: "幻觉率 {{ $value }} 超过 20%"
      # --- business alerts ---
      - alert: LowTaskCompletionRate
        expr: ai_task_completion_rate < 0.6
        for: 30m
        labels:
          severity: warning
        annotations:
          summary: "任务完成率过低"
          description: "任务完成率 {{ $value }} 低于 60%"
3.2 告警路由
# alertmanager_config.yml
# Alertmanager routing: critical -> PagerDuty (and keep matching),
# warning -> Slack, hallucination alerts also go to the AI team channel.
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.example.com:587'
  smtp_from: 'alerts@example.com'

route:
  group_by: ['alertname', 'severity']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'default'
  routes:
    # Page the on-call for critical alerts, then continue matching below.
    - match:
        severity: critical
      receiver: 'pagerduty'
      continue: true
    - match:
        severity: warning
      receiver: 'slack'
    - match:
        alertname: HighHallucinationRate
      receiver: 'ai-team'

receivers:
  - name: 'default'
    email_configs:
      - to: 'team@example.com'
  - name: 'pagerduty'
    pagerduty_configs:
      - service_key: '<pagerduty_service_key>'
  - name: 'slack'
    slack_configs:
      - api_url: '<slack_webhook_url>'
        channel: '#alerts'
  - name: 'ai-team'
    slack_configs:
      - api_url: '<ai_team_webhook_url>'
        channel: '#ai-alerts'

inhibit_rules:
  # Suppress warning-level alerts while the same alertname is critical.
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname']
四、日志管理
4.1 日志规范
# logging_standards.py
import logging
import json
from typing import Dict, Any
class AILoggingStandards:
    """Logging conventions for the AI application.

    Holds the documented meaning of each log level, the field layout of a
    structured log line, and a factory for JSON-emitting loggers.
    """

    # Documented meaning of each log level (reference data, not enforced).
    LOG_LEVELS = {
        'DEBUG': '调试信息,用于开发调试',
        'INFO': '一般信息,用于记录正常流程',
        'WARNING': '警告信息,用于记录潜在问题',
        'ERROR': '错误信息,用于记录错误',
        'CRITICAL': '严重错误,用于记录系统故障',
    }

    # Field layout of one structured (JSON) log line.
    LOG_FORMAT = {
        'timestamp': 'ISO8601',
        'level': 'string',
        'service': 'string',
        'trace_id': 'string',
        'span_id': 'string',
        'message': 'string',
        'context': 'object',
    }

    @staticmethod
    def create_structured_logger(
        name: str,
        level: int = logging.INFO,
    ) -> logging.Logger:
        """Return a logger that emits one JSON object per record."""
        logger = logging.getLogger(name)
        logger.setLevel(level)
        # Console handler with the JSON formatter attached.
        handler = logging.StreamHandler()
        handler.setFormatter(JSONFormatter())
        logger.addHandler(handler)
        return logger
class JSONFormatter(logging.Formatter):
    """Render each log record as one JSON line (UTF-8, not ASCII-escaped)."""

    def format(self, record: logging.LogRecord) -> str:
        payload = {
            'timestamp': self.formatTime(record),
            'level': record.levelname,
            'logger': record.name,
            'message': record.getMessage(),
            'module': record.module,
            'function': record.funcName,
            'line': record.lineno,
        }
        # Optional structured context supplied via ``extra={'context': ...}``.
        if hasattr(record, 'context'):
            payload['context'] = record.context
        # Attach the formatted traceback when the record carries one.
        if record.exc_info:
            payload['exception'] = self.formatException(record.exc_info)
        return json.dumps(payload, ensure_ascii=False)
# --- usage example ---
logger = AILoggingStandards.create_structured_logger('ai_app')

# Ordinary INFO line carrying structured context.
logger.info('请求处理完成', extra={'context': {
    'request_id': '123',
    'latency_ms': 150,
}})

# ERROR line recording the failure cause.
logger.error('处理失败', extra={'context': {
    'request_id': '123',
    'error': 'LLM API timeout',
}})
4.2 日志分析
# log_analysis.py
from typing import Dict, List
import re
class LogAnalyzer:
    """Analyzes application logs held in a log backend.

    ``log_storage`` is the backend handle (e.g. an Elasticsearch client);
    the search methods here are placeholders to be wired to it.

    Fixes two defects in the original block: ``generate_daily_report``
    used ``datetime`` without importing it (NameError), and called five
    aggregation helpers that were never defined (AttributeError).
    """

    def __init__(self, log_storage):
        self.log_storage = log_storage

    def search_logs(
        self,
        query: str,
        time_range: str = '1h',
        limit: int = 100
    ) -> List[Dict]:
        """Search logs matching *query* within *time_range*.

        Placeholder: returns an empty list until backed by ``log_storage``.
        """
        # TODO: implement against self.log_storage
        return []

    def analyze_error_patterns(
        self,
        time_range: str = '24h'
    ) -> Dict:
        """Aggregate ERROR logs by error type over *time_range*.

        Returns the total error count, per-type counts, and the ten most
        frequent error types.
        """
        errors = self.search_logs(
            query='level:ERROR',
            time_range=time_range
        )
        # Count occurrences of each error_type found in the log context.
        error_types: Dict[str, int] = {}
        for log in errors:
            error_type = log.get('context', {}).get('error_type', 'unknown')
            error_types[error_type] = error_types.get(error_type, 0) + 1
        return {
            'total_errors': len(errors),
            'error_types': error_types,
            'top_errors': sorted(
                error_types.items(),
                key=lambda x: x[1],
                reverse=True
            )[:10]
        }

    def detect_anomalies(
        self,
        metric_name: str,
        time_range: str = '24h'
    ) -> List[Dict]:
        """Detect anomalies in *metric_name* over *time_range*.

        Placeholder: returns an empty list until an algorithm is chosen.
        """
        # TODO: implement anomaly detection (e.g. moving-average deviation)
        return []

    def generate_daily_report(self) -> Dict:
        """Build the daily summary report from the aggregation helpers."""
        from datetime import datetime  # local import: keeps module deps unchanged
        return {
            'date': datetime.now().strftime('%Y-%m-%d'),
            'total_requests': self._count_requests(),
            'total_errors': self._count_errors(),
            'error_rate': self._calculate_error_rate(),
            'avg_latency': self._calculate_avg_latency(),
            'top_issues': self._get_top_issues()
        }

    # --- aggregation helpers (previously called but never defined) ---

    def _count_requests(self) -> int:
        """Total requests in the last 24h (placeholder via search_logs)."""
        return len(self.search_logs(query='*', time_range='24h'))

    def _count_errors(self) -> int:
        """Total ERROR logs in the last 24h (placeholder via search_logs)."""
        return len(self.search_logs(query='level:ERROR', time_range='24h'))

    def _calculate_error_rate(self) -> float:
        """Errors / requests; 0.0 when there were no requests."""
        total = self._count_requests()
        return (self._count_errors() / total) if total else 0.0

    def _calculate_avg_latency(self) -> float:
        """Average request latency in ms (placeholder)."""
        # TODO: derive from latency fields once search_logs is implemented
        return 0.0

    def _get_top_issues(self) -> List[Dict]:
        """Most frequent error types, as [{'error_type': ..., 'count': ...}]."""
        top = self.analyze_error_patterns()['top_errors']
        return [{'error_type': etype, 'count': count} for etype, count in top]
五、可视化仪表板
5.1 Grafana 仪表板
{
"dashboard": {
"title": "AI Application Overview",
"panels": [
{
"title": "Request Rate",
"type": "graph",
"targets": [
{
"expr": "sum(rate(ai_requests_total[1m]))",
"legendFormat": "Requests/s"
}
]
},
{
"title": "Response Latency",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.50, sum(rate(ai_request_latency_seconds_bucket[1m])) by (le))",
"legendFormat": "P50"
},
{
"expr": "histogram_quantile(0.99, sum(rate(ai_request_latency_seconds_bucket[1m])) by (le))",
"legendFormat": "P99"
}
]
},
{
"title": "Error Rate",
"type": "graph",
"targets": [
{
"expr": "sum(rate(ai_errors_total[1m])) / sum(rate(ai_requests_total[1m])) * 100",
"legendFormat": "Error Rate %"
}
]
},
{
"title": "AI Quality Metrics",
"type": "graph",
"targets": [
{
"expr": "ai_retrieval_precision * 100",
"legendFormat": "Retrieval Precision %"
},
{
"expr": "ai_generation_faithfulness * 100",
"legendFormat": "Generation Faithfulness %"
},
{
"expr": "ai_hallucination_rate * 100",
"legendFormat": "Hallucination Rate %"
}
]
},
{
"title": "User Satisfaction",
"type": "singlestat",
"targets": [
{
"expr": "ai_user_satisfaction",
"legendFormat": "Avg Rating"
}
]
},
{
"title": "Cost per Query",
"type": "singlestat",
"targets": [
{
"expr": "ai_cost_per_query",
"legendFormat": "USD"
}
]
}
]
}
}
六、总结
6.1 核心要点
- 监控指标
  - 系统指标
  - 应用指标
  - AI 质量指标
  - 业务指标
- 告警策略
  - 分级告警
  - 合理阈值
  - 有效路由
- 日志管理
  - 结构化日志
  - 集中存储
  - 智能分析
6.2 最佳实践
- 全面监控
  - 多层次指标
  - 实时采集
  - 长期存储
- 智能告警
  - 避免告警疲劳
  - 有效告警路由
  - 自动恢复
- 持续改进
  - 定期回顾告警
  - 优化阈值
  - 完善文档
参考资料