Part 1: Deploying an LLM Locally
# 1. System dependencies
sudo apt update && sudo apt install -y python3 python-is-python3 git git-lfs
# 2. Install Miniconda
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
bash miniconda.sh -b -p $HOME/miniconda
source $HOME/miniconda/bin/activate
# 3. Create the environment + PyTorch
conda create -n vllm python=3.12 -y
conda activate vllm
pip install torch==2.11.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
# 4. Install vLLM and friends
pip install vllm transformers accelerate sentencepiece fastapi uvicorn modelscope -i https://pypi.tuna.tsinghua.edu.cn/simple
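Before going further, it is worth confirming that this PyTorch build actually sees both cards and reports the expected compute capability. A minimal check, run inside the vllm env created above:
python - << 'EOF'
# Confirm PyTorch sees both GPUs before starting vLLM
import torch

print("CUDA available:", torch.cuda.is_available())
for i in range(torch.cuda.device_count()):
    p = torch.cuda.get_device_properties(i)
    print(f"GPU {i}: {p.name}, {p.total_memory / 1e9:.1f} GB, sm_{p.major}{p.minor}")
EOF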
# 5. Download the model (ModelScope, fast inside China)
mkdir -p /root/models && cd /root/models/ && git lfs install && git clone https://www.modelscope.cn/cyankiwi/Qwen3.5-27B-AWQ-4bit.git
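A clone of an LFS repo can silently contain pointer stubs instead of real weights if LFS was not set up. This sketch (path taken from the clone above) flags any shard that looks too small:
python - << 'EOF'
# Detect git-lfs pointer stubs: real safetensors shards are hundreds of MB,
# while un-pulled LFS pointers are only ~130 bytes of text
from pathlib import Path

model_dir = Path("/root/models/Qwen3.5-27B-AWQ-4bit")
for f in sorted(model_dir.glob("*.safetensors")):
    size_mb = f.stat().st_size / 1e6
    status = "OK" if size_mb > 1 else "pointer stub, run `git lfs pull`"
    print(f"{f.name}: {size_mb:.1f} MB ({status})")
EOF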
# 6. Service launch script (.sh)
#!/bin/bash
# ========== Environment ==========
export CUDA_VISIBLE_DEVICES=0,1
export NCCL_DEBUG=WARN
export VLLM_LOGGING_LEVEL=INFO
# RTX 5090 + CUDA 12.8 tuning (RTX 5090 is compute capability 12.0, i.e. sm_120)
export TORCH_CUDA_ARCH_LIST="12.0"
export VLLM_ATTENTION_BACKEND=FLASH_ATTN
export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True,max_split_size_mb:512"
# Performance tuning (128-core host)
export VLLM_WORKER_MULTIPROC_METHOD=spawn
export OMP_NUM_THREADS=32
# ========== Model configuration ==========
MODEL_PATH="/root/models/Qwen3.5-27B-AWQ-4bit"  # change to your AWQ model path
PORT=8000
TP_SIZE=2
echo "=== Qwen3.5-27B AWQ on 2x RTX 5090 ==="
nvidia-smi -L
echo "CPU: $(nproc) cores, Memory: $(free -h | awk '/^Mem:/ {print $2}')"
# ========== Launch vLLM ==========
python -m vllm.entrypoints.openai.api_server \
--model "$MODEL_PATH" \
--served-model-name "qwen3.5-27b-awq" \
--api-key "sk-99aa2645fb281c0b0a70cb73b67711f7fd7ad78b223d1be8" \
--tensor-parallel-size "$TP_SIZE" \
--distributed-executor-backend mp \
--dtype auto \
--max-model-len 262144 \
--max-num-seqs 64 \
--gpu-memory-utilization 0.85 \
--enable-prefix-caching \
--enable-chunked-prefill \
--enable-auto-tool-choice \
--tool-call-parser qwen3_coder \
--reasoning-parser qwen3 \
--port "$PORT" \
--host 0.0.0.0
# Parameter notes:
# --ssl-certfile /workspace/certs/cert.pem: TLS certificate (optional)
# --ssl-keyfile /workspace/certs/privkey.pem: TLS private key (optional)
# --tensor-parallel-size 2: tensor parallelism across both GPUs (required here)
# --gpu-memory-utilization 0.85: leave 15% VRAM headroom
# --enable-prefix-caching: automatic prefix caching, speeds up multi-turn chat
# --max-num-seqs 64: maximum number of concurrent sequences
# 7. Test the LLM (the model field must match --served-model-name)
curl http://localhost:8000/v1/chat/completions \
  -H "Authorization: Bearer sk-99aa2645fb281c0b0a70cb73b67711f7fd7ad78b223d1be8" \
  -H "Content-Type: application/json" \
  -d '{"model":"qwen3.5-27b-awq","messages":[{"role":"user","content":"你好"}]}'
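For a scriptable test, the same endpoint can be exercised with the OpenAI Python SDK (assumes `pip install openai`; key and model name are the ones from the launch script). A minimal streaming sketch:
python - << 'EOF'
# Chat test against the local vLLM OpenAI-compatible endpoint
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="sk-99aa2645fb281c0b0a70cb73b67711f7fd7ad78b223d1be8",  # key from the launch script
)
stream = client.chat.completions.create(
    model="qwen3.5-27b-awq",  # must match --served-model-name
    messages=[{"role": "user", "content": "Hello"}],
    stream=True,  # print tokens as they arrive
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
print()
EOF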
Part 2: Deploying the Embedding Service
# 1. ONNX model, running on CPU
git clone https://www.modelscope.cn/onnx-community/ONNX_Qwen3-Embedding-0.6B.git
# 2. Mount and start (docker-compose.yml)
services:
  qwen3-embedding:
    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.8.2
    container_name: qwen3-embedding
    restart: unless-stopped
    ports:
      - "8080:80"
    volumes:
      - /root/models/ONNX_Qwen3-Embedding-0.6B:/models/ONNX_Qwen3-Embedding-0.6B
    command:
      - --model-id
      - /models/ONNX_Qwen3-Embedding-0.6B
      - --dtype
      - float32
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:80/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 180s
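Save the above as docker-compose.yml and bring it up with `docker compose up -d`. The generous start_period gives TEI time to load the ONNX model on CPU before failed health checks start counting against the container.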
# 3. Test
curl -X POST http://localhost:8080/embed \
-H "Content-Type: application/json" \
-d '{"inputs": "你好,世界"}'
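TEI returns one float vector per input. A small sketch (assuming the `requests` package is available) that embeds two sentences and compares them with cosine similarity:
python - << 'EOF'
# Embed two sentences via the TEI /embed endpoint and compare them
import math
import requests

resp = requests.post(
    "http://localhost:8080/embed",
    json={"inputs": ["你好,世界", "今天天气怎么样"]},
    timeout=30,
)
resp.raise_for_status()
a, b = resp.json()  # one float vector per input string
cos = sum(x * y for x, y in zip(a, b)) / (
    math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
)
print(f"dims={len(a)}, cosine similarity={cos:.4f}")
EOF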
Alternative: SentenceTransformer, with a smaller memory footprint
# 1. Download the model
git clone https://www.modelscope.cn/Qwen/Qwen3-Embedding-0.6B.git
# 2. Install dependencies
conda create -n embedding python=3.12 -y
conda activate embedding
pip install -U fastapi uvicorn sentence-transformers -i https://pypi.tuna.tsinghua.edu.cn/simple
# 3. Start the service
cat > /workspace/embedding/embedding_server.py << 'EOF'
import os

# Limit CPU thread usage before the numeric libraries initialize
os.environ["OMP_NUM_THREADS"] = "8"
os.environ["MKL_NUM_THREADS"] = "8"

import logging
from typing import List, Union

from fastapi import FastAPI
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI()

logger.info("Loading embedding model...")
model = SentenceTransformer(
    "/root/models/Qwen3-Embedding-0.6B",
    device="cpu",
)
logger.info("Model loaded.")

# OpenAI-style request body
class EmbedRequest(BaseModel):
    input: Union[str, List[str]]  # a single string or a list of strings
    model: str = "Qwen3-Embedding-0.6B"
    encoding_format: str = "float"

class EmbedResponse(BaseModel):
    object: str = "list"
    data: List[dict]
    model: str
    usage: dict

@app.post("/v1/embeddings")
@app.post("/embeddings")
async def embeddings(request: EmbedRequest):
    """OpenAI-compatible embedding API."""
    # Normalize `input` (string or list) to a list
    inputs = request.input if isinstance(request.input, list) else [request.input]
    logger.info(f"Request received, embedding {len(inputs)} text(s)")
    # Compute the vectors
    embeddings = model.encode(
        inputs,
        normalize_embeddings=True,
        batch_size=32,
        show_progress_bar=False,
    )
    # Build an OpenAI-style response
    data = [
        {
            "object": "embedding",
            "index": i,
            "embedding": emb.tolist(),
        }
        for i, emb in enumerate(embeddings)
    ]
    # Rough token count: whitespace-split word count, not a real tokenizer
    token_count = sum(len(text.split()) for text in inputs)
    return {
        "object": "list",
        "data": data,
        "model": request.model,
        "usage": {
            "prompt_tokens": token_count,
            "total_tokens": token_count,
        },
    }

@app.get("/v1/models")
async def list_models():
    return {
        "object": "list",
        "data": [
            {
                "id": "Qwen3-Embedding-0.6B",
                "object": "model",
                "owned_by": "local",
            }
        ],
    }

@app.get("/health")
async def health():
    return {"status": "ok"}

@app.get("/")
async def root():
    return {"service": "Qwen3-Embedding", "status": "running"}

if __name__ == "__main__":
    import uvicorn
    logger.info("Starting embedding service on port 8080")
    uvicorn.run(app, host="0.0.0.0", port=8080)
EOF
# Test (note the field is `input`, per the OpenAI format, not `inputs`)
curl -X POST http://192.168.0.100:8080/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{"input": ["你好,世界", "今天天气怎么样"]}'
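Because the server implements the OpenAI request/response shape, the official SDK works against it too (assumes `pip install openai`; any non-empty api_key will do since the server performs no auth):
python - << 'EOF'
# Query the local /v1/embeddings endpoint through the OpenAI SDK
from openai import OpenAI

client = OpenAI(
    base_url="http://192.168.0.100:8080/v1",
    api_key="not-needed",  # the server does not check credentials
)
resp = client.embeddings.create(
    model="Qwen3-Embedding-0.6B",
    input=["你好,世界", "今天天气怎么样"],
)
print(len(resp.data), "vectors of dim", len(resp.data[0].embedding))
EOF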
# Run in the background
nohup python embedding_server.py > embedding.log 2>&1 &
# Follow the log
tail -f embedding.log
# Create a systemd service
cat > /etc/systemd/system/embedding.service << 'EOF'
[Unit]
Description=Qwen3 Embedding Service
After=network.target
[Service]
Type=simple
User=root
WorkingDirectory=/workspace/embedding
Environment="PATH=/root/miniconda/envs/embedding/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
ExecStart=/root/miniconda/envs/embedding/bin/python /workspace/embedding/embedding_server.py
Restart=always
RestartSec=10
[Install]
WantedBy=multi-user.target
EOF
# Reload and start
systemctl daemon-reload
systemctl enable embedding
systemctl start embedding
systemctl status embedding
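If the service fails to start, follow its output with `journalctl -u embedding -f`.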
Part 3: Dify
# 1. LLM application development platform: install Docker (Aliyun mirror), clone, and bring it up
curl -fsSL https://get.docker.com | bash -s docker --mirror Aliyun && \
mkdir -p /workspace/ && cd /workspace/ && \
git clone https://github.com/langgenius/dify.git && \
cd ./dify/docker && cp .env.example .env && \
docker compose up -d
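Once the containers are healthy, finish the first-run setup in the browser (Dify's init page is typically at http://<host>/install), then register the local endpoints from the earlier parts (vLLM at http://<host>:8000/v1, embeddings at http://<host>:8080/v1) as OpenAI-API-compatible model providers.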