Title here
Summary here
阅读时间: 约 20 分钟
适用人群: 运维工程师、DevOps
本文提供 UCM 在生产环境中的部署指南,包括单机部署、分布式部署和容器化部署。
| 组件 | 最低要求 | 推荐配置 |
|---|---|---|
| 操作系统 | Linux (Ubuntu 20.04+) | Ubuntu 22.04 LTS |
| CPU | 8 核 | 32+ 核 |
| 内存 | 32 GB | 128+ GB |
| GPU | 1x 16GB 显存 | 多卡 A100/H100 |
| 存储 | 100 GB SSD | NVMe SSD 1TB+ |
| 网络 | 1 Gbps | 25 Gbps+ (分布式) |
Python >= 3.10
PyTorch >= 2.0
vLLM == 0.9.2
CUDA >= 11.8
cuDNN >= 8.6
# 可选依赖
prometheus_client # 监控
grafana # 可视化

# 基础安装
pip install ucm
pip install ucm[full]

git clone https://github.com/your-org/unified-cache-management.git
cd unified-cache-management
# 设置平台
export PLATFORM=cuda # 或 ascend, musa, maca
pip install -e .

FROM nvidia/cuda:12.1-devel-ubuntu22.04
RUN apt-get update && apt-get install -y python3.10 python3-pip
COPY . /app
WORKDIR /app
RUN pip install -e .
EXPOSE 8000 9090
CMD ["python", "-m", "vllm.entrypoints.openai.api_server"]

创建 ucm_config.yaml:
ucm_connectors:
- ucm_connector_name: "UcmPipelineStore"
ucm_connector_config:
# Pipeline 类型
store_pipeline: "Cache|Posix"
# 存储路径
storage_backends: "/data/ucm_cache"
# Pinned Memory 缓冲区
buffer_number: 2048
# Block 大小
block_size: 16
# Direct I/O
io_direct: true
ucm_sparse_config:
ESA:
sparse_ratio: 0.3
local_window_sz: 2
min_blocks: 4
metrics_config_path: "./metrics_config.yaml"
load_only_first_rank: false

ucm_connectors:
- ucm_connector_name: "UcmPipelineStore"
ucm_connector_config:
store_pipeline: "Cache|NFS"
storage_backends:
- "/mnt/nfs/ucm_cache"
- "/mnt/nfs/ucm_cache_backup"
buffer_number: 4096
io_direct: true
# NFS 特定配置
nfs_config:
retry_count: 3
timeout_seconds: 30
multi_gpu:
enabled: true
strategy: "round_robin"
monitoring:
prometheus:
enabled: true
port: 9090
logging:
level: INFO
file: "/var/log/ucm/ucm.log"

创建 start_server.sh:
#!/bin/bash
export PLATFORM=cuda
export UNIFIED_CACHE_LOG_LEVEL=INFO
export CUDA_VISIBLE_DEVICES=0,1,2,3
MODEL_PATH="/models/llama-7b"
UCM_CONFIG="./ucm_config.yaml"
# 启动 vLLM + UCM
python -m vllm.entrypoints.openai.api_server \
--model $MODEL_PATH \
--kv-connector "UCMConnector" \
--kv-connector-module-path "ucm.integration.vllm.ucm_connector" \
--kv-role "kv_both" \
--kv-connector-extra-config "{\"UCM_CONFIG_FILE\": \"$UCM_CONFIG\"}" \
--tensor-parallel-size 4 \
--max-model-len 8192 \
--gpu-memory-utilization 0.8 \
--port 8000

chmod +x start_server.sh
./start_server.sh

curl http://localhost:8000/health
curl http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama-7b",
"messages": [{"role": "user", "content": "Hello!"}]
}'

# /etc/exports
/data/ucm_cache *(rw,sync,no_subtree_check,no_root_squash)
# 启动 NFS
sudo systemctl start nfs-kernel-server

ucm_connectors:
- ucm_connector_name: "UcmPipelineStore"
ucm_connector_config:
store_pipeline: "Cache|NFS"
storage_backends: "192.168.1.100:/data/ucm_cache"
buffer_number: 4096

# nginx.conf
upstream ucm_backend {
least_conn;
server worker1:8000;
server worker2:8000;
server worker3:8000;
}
server {
listen 80;
location / {
proxy_pass http://ucm_backend;
proxy_http_version 1.1;
proxy_set_header Connection "";
}
}

apiVersion: apps/v1
kind: Deployment
metadata:
name: ucm-server
spec:
replicas: 3
selector:
matchLabels:
app: ucm
template:
metadata:
labels:
app: ucm
spec:
containers:
- name: ucm
image: ucm:latest
ports:
- containerPort: 8000
- containerPort: 9090
resources:
limits:
nvidia.com/gpu: 4
volumeMounts:
- name: cache-storage
mountPath: /data/ucm_cache
- name: config
mountPath: /app/config
env:
- name: UNIFIED_CACHE_LOG_LEVEL
value: "INFO"
volumes:
- name: cache-storage
persistentVolumeClaim:
claimName: ucm-cache-pvc
- name: config
configMap:
name: ucm-config

apiVersion: v1
kind: Service
metadata:
name: ucm-service
spec:
type: LoadBalancer
ports:
- name: api
port: 8000
targetPort: 8000
- name: metrics
port: 9090
targetPort: 9090
selector:
app: ucm

apiVersion: v1
kind: ConfigMap
metadata:
name: ucm-config
data:
ucm_config.yaml: |
ucm_connectors:
- ucm_connector_name: "UcmPipelineStore"
ucm_connector_config:
store_pipeline: "Cache|Posix"
storage_backends: "/data/ucm_cache"
buffer_number: 2048

export PLATFORM=cuda
export CUDA_VISIBLE_DEVICES=0,1,2,3
pip install -e .

export PLATFORM=ascend
export ASCEND_VISIBLE_DEVICES=0,1,2,3
pip install -e .

platform:
type: ascend
device_ids: [0, 1, 2, 3]
ucm_connectors:
- ucm_connector_name: "UcmPipelineStore"
ucm_connector_config:
store_pipeline: "Cache|Posix"
storage_backends: "/data/ucm_cache"
# Ascend 特定配置
ascend_config:
enable_hccl: true