Title here
Summary here
LoRA (Low-Rank Adaptation) 是一种高效的模型微调方法:
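核心思想: 冻结原始权重 $W$,只训练低秩增量 $\Delta W = \frac{\alpha}{r} B A$(其中 $A \in \mathbb{R}^{r \times d}$,$B \in \mathbb{R}^{k \times r}$,$r \ll \min(d, k)$),因此可训练参数量远小于原始模型。以 7B 模型为例: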
原始模型: 7B 参数
LoRA (r=16): ~10M 参数 (0.14%)
LoRA (r=64): ~40M 参数 (0.57%)
# 单个 LoRA 适配器
python -m sglang.launch_server \
--model meta-llama/Llama-3.1-8B-Instruct \
--lora-paths my-lora=/path/to/lora
# 多个 LoRA 适配器
python -m sglang.launch_server \
--model meta-llama/Llama-3.1-8B-Instruct \
--lora-paths \
code-lora=/path/to/code-lora \
chat-lora=/path/to/chat-lora \
math-lora=/path/to/math-lora \
--max-loras-per-batch 4class LoRAManager:
def __init__(self, base_model, max_loras):
self.base_model = base_model
self.max_loras = max_loras
# 已加载的适配器
self.loaded_loras: Dict[str, LoRAAdapter] = {}
# LRU 缓存管理
self.lru_cache = LRUCache(max_loras)
def load_lora(self, name: str, path: str):
"""加载 LoRA 适配器"""
if name in self.loaded_loras:
return self.loaded_loras[name]
# 检查容量
if len(self.loaded_loras) >= self.max_loras:
self._evict_lora()
# 加载权重
adapter = LoRAAdapter.from_path(path)
self.loaded_loras[name] = adapter
self.lru_cache.put(name)
return adapter
def _evict_lora(self):
"""淘汰最久未使用的适配器"""
lru_name = self.lru_cache.pop_lru()
del self.loaded_loras[lru_name]@dataclass
class LoRAAdapter:
    """Hyperparameters and per-layer weights of a single LoRA adapter."""

    name: str
    rank: int
    alpha: float

    # Per-layer A and B matrices, keyed by layer name. The shapes below are
    # the ones required by the forward pass ``x @ A.T @ B.T``.
    lora_A: Dict[str, torch.Tensor]  # layer_name -> (r, d_in)
    lora_B: Dict[str, torch.Tensor]  # layer_name -> (d_out, r)

    @classmethod
    def from_path(cls, path: str):
        """Build an adapter from a directory on disk.

        Reads ``adapter_config.json`` and ``adapter_model.safetensors`` from
        ``path`` and splits the weights into A and B matrices per layer.

        Args:
            path: Directory that contains the adapter config and weights.

        Returns:
            A new instance populated from the files.

        Raises:
            KeyError: If the config lacks ``r`` or ``lora_alpha``.
        """
        import os

        config = load_json(f"{path}/adapter_config.json")
        weights = load_safetensors(f"{path}/adapter_model.safetensors")

        lora_A = {}
        lora_B = {}

        for key, value in weights.items():
            if "lora_A" in key:
                layer_name = key.replace(".lora_A.weight", "")
                lora_A[layer_name] = value
            elif "lora_B" in key:
                layer_name = key.replace(".lora_B.weight", "")
                lora_B[layer_name] = value

        # A standard PEFT adapter_config.json has no "name" entry, so fall
        # back to the adapter directory name instead of raising KeyError.
        try:
            name = config["name"]
        except KeyError:
            name = os.path.basename(os.path.normpath(path))

        return cls(
            name=name,
            rank=config["r"],
            alpha=config["lora_alpha"],
            lora_A=lora_A,
            lora_B=lora_B,
        )


# Specify the LoRA adapter
response = client.chat.completions.create(
    model="code-lora",  # use the LoRA name
    messages=[
        {"role": "user", "content": "Write a Python function"}
    ]
)
# Or use the native API
response = requests.post(
    "http://localhost:30000/generate",
    json={
        "text": "Write a Python function",
        "sampling_params": {"temperature": 0.7},
        # The adapter is a request-level field that sits next to
        # "sampling_params"; it is not a sampling parameter.
        "lora_path": "code-lora",
    },
)


def forward_with_lora(self, x, lora_adapter):
    """Forward pass of one layer with a LoRA update added.

    Computes ``base_layer(x) + (alpha / rank) * (x @ A.T @ B.T)``.

    Args:
        x: Input passed to ``self.base_layer``.
        lora_adapter: Object exposing ``alpha``, ``rank``, ``lora_A`` and
            ``lora_B``.

    Returns:
        The base layer output plus the scaled LoRA output.
    """
    # NOTE(review): ``lora_A`` / ``lora_B`` are used here as single tensors
    # (``.T``), whereas LoRAAdapter stores them as dicts keyed by layer name.
    # Presumably the caller passes this layer's matrices -- confirm.

    # Base model computation
    base_output = self.base_layer(x)

    # LoRA computation
    scaling = lora_adapter.alpha / lora_adapter.rank
    lora_output = (x @ lora_adapter.lora_A.T @ lora_adapter.lora_B.T) * scaling

    return base_output + lora_output


# Segmented Gather Matrix-Vector multiplication
# 高效处理多 LoRA 批量计算
def sgmv_forward(
x: torch.Tensor, # (total_tokens, hidden_size)
lora_A_weights: List[torch.Tensor], # 多个 LoRA A 矩阵
lora_B_weights: List[torch.Tensor], # 多个 LoRA B 矩阵
seg_indices: torch.Tensor, # 分段索引
):
"""SGMV 内核:高效多 LoRA 计算"""
# 使用 Triton 实现的高效内核
# 避免多次内核启动开销
return triton_sgmv_kernel(
x, lora_A_weights, lora_B_weights, seg_indices
)内存占用:
基础模型: 14 GB (FP16)
每个 LoRA: ~20 MB
10 个 LoRA: 14 GB + 200 MB = 14.2 GB
class DynamicLoRALoader:
def __init__(self, max_loaded=4, max_cached=16):
self.max_loaded = max_loaded # GPU 上最多加载数
self.max_cached = max_cached # CPU 缓存数
self.gpu_loras = {} # GPU 上的适配器
self.cpu_cache = {} # CPU 缓存
def get_lora(self, name: str):
"""获取 LoRA,必要时加载"""
if name in self.gpu_loras:
return self.gpu_loras[name]
if name in self.cpu_cache:
# 从 CPU 移动到 GPU
return self._move_to_gpu(name)
# 从磁盘加载
return self._load_from_disk(name)
def _move_to_gpu(self, name):
"""CPU -> GPU"""
if len(self.gpu_loras) >= self.max_loaded:
self._evict_from_gpu()
lora = self.cpu_cache[name].to("cuda")
self.gpu_loras[name] = lora
return lora| 参数 | 说明 | 默认值 |
|---|---|---|
| `--lora-paths` | LoRA 路径映射 | 无 |
| `--max-loras-per-batch` | 批次内最大 LoRA 数 | 4 |
| `--max-num-batched-tokens` | 最大批处理 tokens | 8192 |
# The LoRA adapter is selected per request. In the native /generate API,
# "lora_path" is a request-level field next to "sampling_params", not a key
# inside it.
sampling_params = {
    "temperature": 0.7,
    "max_new_tokens": 512,
}
request_body = {
    "sampling_params": sampling_params,
    "lora_path": "my-lora-name",  # name of the LoRA adapter to use
}

import openai

# NOTE(review): the OpenAI client needs an API key, either as an argument or
# via the OPENAI_API_KEY environment variable -- confirm for your setup.
client = openai.Client(base_url="http://localhost:30000/v1")

# Use the code LoRA
code_response = client.chat.completions.create(
    model="code-lora",
    messages=[
        {"role": "user", "content": "实现快速排序算法"}
    ]
)

# Use the math LoRA
math_response = client.chat.completions.create(
    model="math-lora",
    messages=[
        {"role": "user", "content": "证明勾股定理"}
    ]
)

import asyncio
async def multi_lora_inference(requests):
    """Run several chat requests concurrently, each with its own LoRA.

    Args:
        requests: Iterable of dicts with a ``"lora"`` entry (passed as the
            ``model`` argument) and a ``"messages"`` entry.

    Returns:
        A list of responses in the same order as ``requests``.
    """
    import inspect

    create = client.chat.completions.create

    async def _run_one(req):
        # A synchronous client blocks and returns a plain response, which
        # asyncio.gather() cannot await. Run the call in a worker thread so
        # requests overlap instead of failing or running back to back.
        result = await asyncio.to_thread(
            create,
            model=req["lora"],
            messages=req["messages"],
        )
        # An async client returns an awaitable from create(); await it here
        # so both client flavours yield the final response.
        if inspect.isawaitable(result):
            result = await result
        return result

    tasks = [_run_one(req) for req in requests]
    results = await asyncio.gather(*tasks)
    return results
# 请求列表
requests = [
{"lora": "code-lora", "messages": [...]},
{"lora": "chat-lora", "messages": [...]},
{"lora": "code-lora", "messages": [...]},
]
results = asyncio.run(multi_lora_inference(requests))| 场景 | 相对吞吐量 |
|---|---|
| 无 LoRA | 100% |
| 单 LoRA | 95-98% |
| 多 LoRA (同批次) | 85-95% |
| 频繁切换 | 70-85% |
# 检查 LoRA 是否加载
curl http://localhost:30000/v1/models
# 确认使用正确的模型名
# 模型名应为 --lora-paths 中指定的名称

# 减少同时加载的 LoRA 数
python -m sglang.launch_server \
--model meta-llama/Llama-3.1-8B-Instruct \
--lora-paths my-lora=/path/to/lora \
--max-loras-per-batch 2 # 减少数量

# 检查是否频繁切换
# 优化请求分组
# 考虑使用 LoRA 合并

| 特性 | 支持情况 |
|---|---|
| 多适配器 | 支持 |
| 动态切换 | 支持 |
| 批量推理 | 支持 |
| SGMV 优化 | 支持 |
在下一章《多模态支持》中,我们将: