Prometheus+Grafana监控实战

引言：在微服务架构中，监控系统是保障系统稳定性和可靠性的关键基础设施。Prometheus是CNCF毕业的开源监控系统，以其强大的多维度数据模型、高效的查询语言PromQL和灵活的架构设计，成为云原生监控的事实标准。Grafana则是最流行的可视化平台，可以连接多种数据源，创建丰富的仪表板。本文将详细介绍Prometheus和Grafana的部署配置、在Spring Boot应用中的集成方法，以及构建完整监控体系的最佳实践。

一、Prometheus核心概念

1.1 数据模型：Prometheus采用时序数据库存储数据，每条时序数据由指标名称、标签集和时间戳组成。标签允许对指标进行多维度切分，PromQL可以基于这些标签进行灵活查询。

# 指标格式
metric_name{label1="value1", label2="value2"} value timestamp

# 示例
http_requests_total{method="GET", status="200", handler="/api/users"} 1523 1704067200
process_cpu_seconds_total{instance="order-service:8080"} 4523.56 1704067200

1.2 四种指标类型：Counter是只增不减的计数器，用于统计请求数、错误数等；Gauge是可增可减的仪表盘，用于记录当前连接数、内存使用量等；Histogram用于记录观察值的分布，自动计算分位数；Summary与Histogram类似，但由客户端计算分位数。

// Spring Boot Micrometer中的指标定义
@Configuration
public class MetricsConfig {

    @Bean
    public Counter orderCreatedCounter(MeterRegistry registry) {
        return Counter.builder("orders.created")
            .description("Number of orders created")
            .tag("type", "online")
            .register(registry);
    }

    @Bean
    public Gauge activeConnectionsGauge(MeterRegistry registry) {
        return Gauge.builder("connections.active", connectionPool, ConnectionPool::getActiveCount)
            .description("Number of active connections")
            .register(registry);
    }

    @Bean
    public Timer orderProcessingTimer(MeterRegistry registry) {
        return Timer.builder("order.processing.duration")
            .description("Time taken to process orders")
            .publishPercentiles(0.5, 0.95, 0.99)
            .register(registry);
    }

    @Bean
    public DistributionSummary orderAmountSummary(MeterRegistry registry) {
        return DistributionSummary.builder("order.amount")
            .description("Order amount distribution")
            .publishPercentiles(0.5, 0.95, 0.99)
            .register(registry);
    }
}

二、Prometheus部署配置

2.1 Kubernetes部署

# prometheus-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: monitoring
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      evaluation_interval: 15s
      external_labels:
        cluster: production
        environment: kubernetes
    alerting:
      alertmanagers:
        - static_configs:
            - targets:
                - alertmanager.monitoring.svc:9093
    rule_files:
      - 
/etc/prometheus/rules/*.yml scrape_configs: # Prometheus自我监控 - job_name: prometheus static_configs: - targets: [localhost:9090] # Kubernetes API Server - job_name: kubernetes-apiservers kubernetes_sd_configs: - role: endpoints scheme: https tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token relabel_configs: - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] action: keep regex: default;kubernetes;https # Kubernetes Pods - job_name: kubernetes-pods kubernetes_sd_configs: - role: pod relabel_configs: - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] action: keep regex: true - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] action: replace target_label: __metrics_path__ regex: (.) - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] action: replace regex: ([^:])(?::\d)?;(\d) replacement: $1:$2 target_label: __address__ - action: labelmap regex: __meta_kubernetes_pod_label_(.)2.2 Prometheus Operator# Prometheus CRD apiVersion: monitoring.coreos.com/v1 kind: Prometheus metadata: name: prometheus namespace: monitoring spec: replicas: 2 retention: 15d retentionSize: 50GB serviceAccountName: prometheus serviceMonitorSelector: matchLabels: team: frontend ruleSelector: matchLabels: role: alert-rules alerting: alertmanagers: - namespace: monitoring name: alertmanager-main port: web resources: requests: memory: 2Gi cpu: 1000m limits: memory: 8Gi cpu: 4000m storage: volumeClaimTemplate: spec: storageClassName: ssd resources: requests: storage: 100Gi三、Spring Boot应用集成3.1 引入依赖dependencies !-- Actuator暴露健康指标 -- dependency groupIdorg.springframework.boot/groupId artifactIdspring-boot-starter-actuator/artifactId /dependency !-- Micrometer Prometheus注册器 -- dependency groupIdio.micrometer/groupId artifactIdmicrometer-registry-prometheus/artifactId 
</dependency>
    <!-- JVM和系统指标 -->
    <dependency>
        <groupId>io.micrometer</groupId>
        <artifactId>micrometer-core</artifactId>
    </dependency>
</dependencies>

3.2 配置文件

management:
  endpoints:
    web:
      exposure:
        include: health,info,prometheus,metrics,logfile
      base-path: /actuator
  endpoint:
    health:
      show-details: always
      probes:
        enabled: true
    prometheus:
      enabled: true
  metrics:
    export:
      prometheus:
        enabled: true
    distribution:
      percentiles-histogram:
        http.server.requests: true
      percentiles:
        http.server.requests: 0.5, 0.95, 0.99
      slo:
        http.server.requests: 50ms, 100ms, 200ms, 500ms, 1s
    tags:
      application: ${spring.application.name}
      environment: ${ENV:development}

3.3 自定义业务指标

@Service
public class OrderMetricsService {

    private final MeterRegistry registry;
    private final Counter orderSuccessCounter;
    private final Counter orderFailedCounter;
    private final Timer orderProcessingTimer;
    private final Gauge activeOrdersGauge;
    private final AtomicInteger activeOrders;

    public OrderMetricsService(MeterRegistry registry) {
        this.registry = registry;
        this.orderSuccessCounter = Counter.builder("orders.success")
            .description("Successful order count")
            .tag("type", "online")
            .register(registry);
        this.orderFailedCounter = Counter.builder("orders.failed")
            .description("Failed order count")
            .tag("type", "online")
            .register(registry);
        this.orderProcessingTimer = Timer.builder("orders.processing.time")
            .description("Order processing duration")
            .publishPercentiles(0.5, 0.95, 0.99)
            .register(registry);
        this.activeOrders = new AtomicInteger(0);
        this.activeOrdersGauge = Gauge.builder("orders.active", activeOrders, AtomicInteger::get)
            .description("Active orders count")
            .register(registry);
    }

    public void recordOrderSuccess(Order order) {
        orderSuccessCounter.increment();
        orderProcessingTimer.record(order.getProcessingTime(), TimeUnit.MILLISECONDS);
    }

    public void recordOrderFailed(Order order, Throwable error) {
        orderFailedCounter.increment();
        Tags tags = Tags.of("error", error.getClass().getSimpleName());
        // 带错误类型标签的细分计数器，注册到注入的 MeterRegistry
        Counter.builder("orders.failed.detailed")
            .tags(tags)
            .register(registry)
            .increment();
    }

    public void 
incrementActiveOrders() { activeOrders.incrementAndGet(); } public void decrementActiveOrders() { activeOrders.decrementAndGet(); } }3.4 ServiceMonitor配置apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: order-service-monitor namespace: monitoring labels: team: frontend spec: selector: matchLabels: app: order-service namespaceSelector: matchNames: - production endpoints: - port: web path: /actuator/prometheus interval: 15s scrapeTimeout: 10s relabelings: - sourceLabels: [__meta_kubernetes_pod_name] targetLabel: pod - sourceLabels: [__meta_kubernetes_namespace] targetLabel: namespace jobLabel: order-service四、AlertManager告警配置4.1 告警规则apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: name: order-service-alerts namespace: production labels: role: alert-rules spec: groups: - name: order-service.rules rules: - alert: HighErrorRate expr: | sum(rate(orders_failed_total[5m])) / sum(rate(orders_success_total[5m])) 0.05 for: 5m labels: severity: critical team: backend annotations: summary: High error rate detected description: Order service error rate is above 5% for 5 minutes - alert: HighLatency expr: | histogram_quantile(0.95, sum(rate(orders_processing_time_seconds_bucket[5m])) by (le)) 2 for: 5m labels: severity: warning annotations: summary: High order processing latency description: 95th percentile latency is above 2 seconds - alert: InstanceDown expr: up{joborder-service} 0 for: 1m labels: severity: critical annotations: summary: Instance down description: Order service instance is down - alert: HighMemoryUsage expr: | (jvm_memory_used_bytes{areaheap} / jvm_memory_max_bytes{areaheap}) 0.9 for: 5m labels: severity: warning annotations: summary: High JVM memory usage description: JVM heap usage is above 90%4.2 AlertManager配置apiVersion: monitoring.coreos.com/v1 kind: Alertmanager metadata: name: alertmanager-main namespace: monitoring spec: replicas: 2 config: route: group_by: [alertname, severity] group_wait: 30s group_interval: 
5m repeat_interval: 12h receiver: default-receiver routes: - match: severity: critical receiver: critical-receiver continue: true - match: team: backend receiver: backend-team receivers: - name: default-receiver webhook_configs: - url: http://notification-service:8080/webhook send_resolved: true - name: critical-receiver pagerduty_configs: - service_key: YOUR_PAGERDUTY_KEY severity: critical - name: backend-team email_configs: - to: backend-teamexample.com send_resolved: true五、Grafana仪表板5.1 数据源配置apiVersion: v1 kind: ConfigMap metadata: name: grafana-datasources namespace: monitoring data: prometheus.yaml: | apiVersion: 1 datasources: - name: Prometheus type: prometheus access: proxy url: http://prometheus:9090 isDefault: true editable: true jsonData: timeInterval: 15s queryTimeout: 60s5.2 JVM监控仪表板{ dashboard: { title: JVM Performance Dashboard, panels: [ { title: JVM Memory Usage, type: timeseries, gridPos: {x: 0, y: 0, w: 12, h: 8}, targets: [ { expr: jvm_memory_used_bytes{area\heap\}, legendFormat: {{id}} - {{application}} } ], fieldConfig: { defaults: { unit: bytes, custom: { drawStyle: line, lineWidth: 2 } } } }, { title: GC Metrics, type: timeseries, gridPos: {x: 12, y: 0, w: 12, h: 8}, targets: [ { expr: rate(jvm_gc_pause_seconds_sum[5m]), legendFormat: {{action}} - {{cause}} } ] }, { title: Thread Count, type: stat, gridPos: {x: 0, y: 8, w: 6, h: 4}, targets: [ { expr: jvm_threads_live_threads{application\order-service\} } ] }, { title: HTTP Request Rate, type: timeseries, gridPos: {x: 6, y: 8, w: 18, h: 8}, targets: [ { expr: sum(rate(http_server_requests_seconds_count[5m])) by (uri, status), legendFormat: {{uri}} - {{status}} } ] } ] } }六、告警通知集成6.1 钉钉告警apiVersion: v1 kind: Secret metadata: name: dingtalk-webhook namespace: monitoring type: Opaque stringData: url: https://oapi.dingtalk.com/robot/send?access_tokenYOUR_TOKEN --- apiVersion: monitoring.coreos.com/v1 kind: AlertmanagerConfig metadata: name: dingtalk-config namespace: monitoring spec: receivers: 
- name: dingtalk dingtalkConfigs: - webhook: url: key: url name: dingtalk-webhook msgType: markdown atAll: false route: groupBy: [alertname] receiver: dingtalk6.2 企业微信告警apiVersion: monitoring.coreos.com/v1 kind: AlertmanagerConfig metadata: name: wechat-config namespace: monitoring spec: receivers: - name: wechat wechatConfigs: - apiURL: url: https://qyapi.weixin.qq.com/cgi-bin/ corpID: YOUR_CORP_ID agentID: 1000001 apiSecret: name: wechat-api-secret key: secret toParty: 1 toUser: all七、最佳实践7.1 指标命名规范指标名称应遵循以下规范使用小写字母和下划线包含功能域前缀包含度量单位后缀包含描述性的复数名词。# 推荐命名 order_processing_duration_seconds user_login_total cache_hit_ratio # 不推荐命名 OrderProcessingTime /User/Login/Count CacheHitRate7.2 标签使用建议避免使用高基数标签如用户ID、请求ID等标签值应该有限且稳定避免标签数量过多使用role、instance、job等标准标签。7.3 性能优化# Prometheus远程写入配置 remote_write: - url: https://remote-write-endpoint/api/v1/write queue_config: capacity: 10000 max_shards: 30 min_shards: 1 max_samples_per_send: 5000 batch_send_deadline: 30s总结Prometheus和Grafana的组合提供了完整的云原生监控解决方案。Prometheus负责指标的采集、存储和告警Grafana负责数据的可视化和分析。通过在Spring Boot应用中集成Micrometer可以轻松暴露丰富的业务指标和JVM指标。结合AlertManager和各类通知渠道可以实现及时的问题告警。建立完善的监控体系是保障系统稳定性的基础也是SRE实践的重要组成部分。
Prometheus+Grafana监控实战
发布时间:2026/5/17 4:07:42
PrometheusGrafana监控实战引言在微服务架构中监控系统是保障系统稳定性和可靠性的关键基础设施。Prometheus是CNCF毕业的开源监控系统以其强大的多维度数据模型、高效的查询语言PromQL和灵活的架构设计成为云原生监控的事实标准。Grafana则是最流行的可视化平台可以连接多种数据源创建丰富的仪表板。本文将详细介绍Prometheus和Grafana的部署配置、在Spring Boot应用中的集成方法以及构建完整监控体系的最佳实践。一、Prometheus核心概念1.1 数据模型Prometheus采用时序数据库存储数据每条时序数据由指标名称、标签集和时间戳组成。标签允许对指标进行多维度切分PromQL可以基于这些标签进行灵活查询。# 指标格式 metric_name{label1value1, label2value2} value timestamp # 示例 http_requests_total{methodGET, status200, handler/api/users} 1523 1704067200 process_cpu_seconds_total{instanceorder-service:8080} 4523.56 17040672001.2 四种指标类型Counter是只增不减的计数器用于统计请求数、错误数等Gauge是可增可减的仪表盘用于记录当前连接数、内存使用量等Histogram用于记录观察值的分布自动计算分位数Summary与Histogram类似但由客户端计算分位数。// Spring Boot Micrometer中的指标定义 Configuration public class MetricsConfig { Bean public Counter orderCreatedCounter(MeterRegistry registry) { return Counter.builder(orders.created) .description(Number of orders created) .tag(type, online) .register(registry); } Bean public Gauge activeConnectionsGauge(MeterRegistry registry) { return Gauge.builder(connections.active, connectionPool, ConnectionPool::getActiveCount) .description(Number of active connections) .register(registry); } Bean public Timer orderProcessingTimer(MeterRegistry registry) { return Timer.builder(order.processing.duration) .description(Time taken to process orders) .publishPercentiles(0.5, 0.95, 0.99) .register(registry); } Bean public DistributionSummary orderAmountSummary(MeterRegistry registry) { return DistributionSummary.builder(order.amount) .description(Order amount distribution) .publishPercentiles(0.5, 0.95, 0.99) .register(registry); } }二、Prometheus部署配置2.1 Kubernetes部署# prometheus-configmap.yaml apiVersion: v1 kind: ConfigMap metadata: name: prometheus-config namespace: monitoring data: prometheus.yml: | global: scrape_interval: 15s evaluation_interval: 15s external_labels: cluster: production environment: kubernetes alerting: alertmanagers: - static_configs: - targets: - alertmanager.monitoring.svc:9093 rule_files: - 
/etc/prometheus/rules/*.yml scrape_configs: # Prometheus自我监控 - job_name: prometheus static_configs: - targets: [localhost:9090] # Kubernetes API Server - job_name: kubernetes-apiservers kubernetes_sd_configs: - role: endpoints scheme: https tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token relabel_configs: - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] action: keep regex: default;kubernetes;https # Kubernetes Pods - job_name: kubernetes-pods kubernetes_sd_configs: - role: pod relabel_configs: - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] action: keep regex: true - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] action: replace target_label: __metrics_path__ regex: (.) - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] action: replace regex: ([^:])(?::\d)?;(\d) replacement: $1:$2 target_label: __address__ - action: labelmap regex: __meta_kubernetes_pod_label_(.)2.2 Prometheus Operator# Prometheus CRD apiVersion: monitoring.coreos.com/v1 kind: Prometheus metadata: name: prometheus namespace: monitoring spec: replicas: 2 retention: 15d retentionSize: 50GB serviceAccountName: prometheus serviceMonitorSelector: matchLabels: team: frontend ruleSelector: matchLabels: role: alert-rules alerting: alertmanagers: - namespace: monitoring name: alertmanager-main port: web resources: requests: memory: 2Gi cpu: 1000m limits: memory: 8Gi cpu: 4000m storage: volumeClaimTemplate: spec: storageClassName: ssd resources: requests: storage: 100Gi三、Spring Boot应用集成3.1 引入依赖dependencies !-- Actuator暴露健康指标 -- dependency groupIdorg.springframework.boot/groupId artifactIdspring-boot-starter-actuator/artifactId /dependency !-- Micrometer Prometheus注册器 -- dependency groupIdio.micrometer/groupId artifactIdmicrometer-registry-prometheus/artifactId 
/dependency !-- JVM和系统指标 -- dependency groupIdio.micrometer/groupId artifactIdmicrometer-core/artifactId /dependency /dependencies3.2 配置文件management: endpoints: web: exposure: include: health,info,prometheus,metrics,logfile base-path: /actuator endpoint: health: show-details: always probes: enabled: true prometheus: enabled: true metrics: export: prometheus: enabled: true distribution: percentiles-histogram: http.server.requests: true percentiles: http.server.requests: 0.5, 0.95, 0.99 slo: http.server.requests: 50ms, 100ms, 200ms, 500ms, 1s tags: application: ${spring.application.name} environment: ${ENV:development}3.3 自定义业务指标Service public class OrderMetricsService { private final Counter orderSuccessCounter; private final Counter orderFailedCounter; private final Timer orderProcessingTimer; private final Gauge activeOrdersGauge; private final AtomicInteger activeOrders; public OrderMetricsService(MeterRegistry registry) { this.orderSuccessCounter Counter.builder(orders.success) .description(Successful order count) .tag(type, online) .register(registry); this.orderFailedCounter Counter.builder(orders.failed) .description(Failed order count) .tag(type, online) .register(registry); this.orderProcessingTimer Timer.builder(orders.processing.time) .description(Order processing duration) .publishPercentiles(0.5, 0.95, 0.99) .register(registry); this.activeOrders new AtomicInteger(0); this.activeOrdersGauge Gauge.builder(orders.active, activeOrders, AtomicInteger::get) .description(Active orders count) .register(registry); } public void recordOrderSuccess(Order order) { orderSuccessCounter.increment(); orderProcessingTimer.record(order.getProcessingTime(), TimeUnit.MILLISECONDS); } public void recordOrderFailed(Order order, Throwable error) { orderFailedCounter.increment(); Tags tags Tags.of(error, error.getClass().getSimpleName()); Counter.builder(orders.failed.detailed) .tags(tags) .register(Registry.class.cast(orderSuccessCounter.getId())) .increment(); } public void 
incrementActiveOrders() { activeOrders.incrementAndGet(); } public void decrementActiveOrders() { activeOrders.decrementAndGet(); } }3.4 ServiceMonitor配置apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: order-service-monitor namespace: monitoring labels: team: frontend spec: selector: matchLabels: app: order-service namespaceSelector: matchNames: - production endpoints: - port: web path: /actuator/prometheus interval: 15s scrapeTimeout: 10s relabelings: - sourceLabels: [__meta_kubernetes_pod_name] targetLabel: pod - sourceLabels: [__meta_kubernetes_namespace] targetLabel: namespace jobLabel: order-service四、AlertManager告警配置4.1 告警规则apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: name: order-service-alerts namespace: production labels: role: alert-rules spec: groups: - name: order-service.rules rules: - alert: HighErrorRate expr: | sum(rate(orders_failed_total[5m])) / sum(rate(orders_success_total[5m])) 0.05 for: 5m labels: severity: critical team: backend annotations: summary: High error rate detected description: Order service error rate is above 5% for 5 minutes - alert: HighLatency expr: | histogram_quantile(0.95, sum(rate(orders_processing_time_seconds_bucket[5m])) by (le)) 2 for: 5m labels: severity: warning annotations: summary: High order processing latency description: 95th percentile latency is above 2 seconds - alert: InstanceDown expr: up{joborder-service} 0 for: 1m labels: severity: critical annotations: summary: Instance down description: Order service instance is down - alert: HighMemoryUsage expr: | (jvm_memory_used_bytes{areaheap} / jvm_memory_max_bytes{areaheap}) 0.9 for: 5m labels: severity: warning annotations: summary: High JVM memory usage description: JVM heap usage is above 90%4.2 AlertManager配置apiVersion: monitoring.coreos.com/v1 kind: Alertmanager metadata: name: alertmanager-main namespace: monitoring spec: replicas: 2 config: route: group_by: [alertname, severity] group_wait: 30s group_interval: 
5m repeat_interval: 12h receiver: default-receiver routes: - match: severity: critical receiver: critical-receiver continue: true - match: team: backend receiver: backend-team receivers: - name: default-receiver webhook_configs: - url: http://notification-service:8080/webhook send_resolved: true - name: critical-receiver pagerduty_configs: - service_key: YOUR_PAGERDUTY_KEY severity: critical - name: backend-team email_configs: - to: backend-teamexample.com send_resolved: true五、Grafana仪表板5.1 数据源配置apiVersion: v1 kind: ConfigMap metadata: name: grafana-datasources namespace: monitoring data: prometheus.yaml: | apiVersion: 1 datasources: - name: Prometheus type: prometheus access: proxy url: http://prometheus:9090 isDefault: true editable: true jsonData: timeInterval: 15s queryTimeout: 60s5.2 JVM监控仪表板{ dashboard: { title: JVM Performance Dashboard, panels: [ { title: JVM Memory Usage, type: timeseries, gridPos: {x: 0, y: 0, w: 12, h: 8}, targets: [ { expr: jvm_memory_used_bytes{area\heap\}, legendFormat: {{id}} - {{application}} } ], fieldConfig: { defaults: { unit: bytes, custom: { drawStyle: line, lineWidth: 2 } } } }, { title: GC Metrics, type: timeseries, gridPos: {x: 12, y: 0, w: 12, h: 8}, targets: [ { expr: rate(jvm_gc_pause_seconds_sum[5m]), legendFormat: {{action}} - {{cause}} } ] }, { title: Thread Count, type: stat, gridPos: {x: 0, y: 8, w: 6, h: 4}, targets: [ { expr: jvm_threads_live_threads{application\order-service\} } ] }, { title: HTTP Request Rate, type: timeseries, gridPos: {x: 6, y: 8, w: 18, h: 8}, targets: [ { expr: sum(rate(http_server_requests_seconds_count[5m])) by (uri, status), legendFormat: {{uri}} - {{status}} } ] } ] } }六、告警通知集成6.1 钉钉告警apiVersion: v1 kind: Secret metadata: name: dingtalk-webhook namespace: monitoring type: Opaque stringData: url: https://oapi.dingtalk.com/robot/send?access_tokenYOUR_TOKEN --- apiVersion: monitoring.coreos.com/v1 kind: AlertmanagerConfig metadata: name: dingtalk-config namespace: monitoring spec: receivers: 
- name: dingtalk dingtalkConfigs: - webhook: url: key: url name: dingtalk-webhook msgType: markdown atAll: false route: groupBy: [alertname] receiver: dingtalk6.2 企业微信告警apiVersion: monitoring.coreos.com/v1 kind: AlertmanagerConfig metadata: name: wechat-config namespace: monitoring spec: receivers: - name: wechat wechatConfigs: - apiURL: url: https://qyapi.weixin.qq.com/cgi-bin/ corpID: YOUR_CORP_ID agentID: 1000001 apiSecret: name: wechat-api-secret key: secret toParty: 1 toUser: all七、最佳实践7.1 指标命名规范指标名称应遵循以下规范使用小写字母和下划线包含功能域前缀包含度量单位后缀包含描述性的复数名词。# 推荐命名 order_processing_duration_seconds user_login_total cache_hit_ratio # 不推荐命名 OrderProcessingTime /User/Login/Count CacheHitRate7.2 标签使用建议避免使用高基数标签如用户ID、请求ID等标签值应该有限且稳定避免标签数量过多使用role、instance、job等标准标签。7.3 性能优化# Prometheus远程写入配置 remote_write: - url: https://remote-write-endpoint/api/v1/write queue_config: capacity: 10000 max_shards: 30 min_shards: 1 max_samples_per_send: 5000 batch_send_deadline: 30s总结Prometheus和Grafana的组合提供了完整的云原生监控解决方案。Prometheus负责指标的采集、存储和告警Grafana负责数据的可视化和分析。通过在Spring Boot应用中集成Micrometer可以轻松暴露丰富的业务指标和JVM指标。结合AlertManager和各类通知渠道可以实现及时的问题告警。建立完善的监控体系是保障系统稳定性的基础也是SRE实践的重要组成部分。