MBE 模型市场平台 - 部署配置指南
一、部署架构概览
┌─────────────────────────────────────────────────────────────────────────┐
│ 生产环境部署架构 │
├─────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────┐ │
│ │ Nginx │ │
│ │ 负载均衡 │ │
│ └──────┬──────┘ │
│ │ │
│ ┌──────────────────┼──────────────────┐ │
│ │ │ │ │
│ ▼ ▼ ▼ │
│ ┌──────────┐ ┌──────────┐ ┌──────────┐ │
│ │ API 实例1 │ │ API 实例2 │ │ API 实例N │ │
│ │ (FastAPI) │ │ (FastAPI) │ │ (FastAPI) │ │
│ └─────┬────┘ └─────┬────┘ └─────┬────┘ │
│ │ │ │ │
│ └──────────────────┼──────────────────┘ │
│ │ │
│ ┌────────────────────────────┼────────────────────────────┐ │
│ │ ▼ │ │
│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │
│ │ │ Redis │ │ PostgreSQL │ │ MinIO │ │ │
│ │ │ 缓存/队列 │ │ 数据库 │ │ 对象存储 │ │ │
│ │ └─────────────┘ └─────────────┘ └─────────────┘ │ │
│ │ 数据持久层 │ │
│ └─────────────────────────────────────────────────────────┘ │
│ │
│ ┌─────────────────────────────────────────────────────────┐ │
│ │ Worker 集群 │ │
│ │ ┌────────────┐ ┌────────────┐ ┌────────────────┐ │ │
│ │ │CPU Worker×4│ │CPU Worker×4│ │ GPU Worker×N │ │ │
│ │ │ 分块/索引 │ │ 分块/索引 │ │ 训练/推理 │ │ │
│ │ └────────────┘ └────────────┘ └────────────────┘ │ │
│ └─────────────────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────────────┘
二、部署方案选择
方案对比
| 方案 | 适用规模 | 复杂度 | 成本 | 扩展性 |
| --- | --- | --- | --- | --- |
| 单机 Docker Compose | 小型 (1-50用户) | ⭐ | 低 | 有限 |
| 多机 Docker Swarm | 中型 (50-200用户) | ⭐⭐ | 中 | 良好 |
| Kubernetes | 大型 (200+用户) | ⭐⭐⭐⭐ | 高 | 优秀 |
| 混合云 | 弹性需求 | ⭐⭐⭐ | 灵活 | 优秀 |
三、方案一:单机部署(推荐起步)
3.1 硬件配置
# 最小配置(开发/测试)
最小配置:
CPU: 4 核
内存: 8 GB
存储: 100 GB SSD
GPU: 无
带宽: 10 Mbps
预估成本: 500 元/月
# 推荐配置(小型生产)
推荐配置:
CPU: 8 核 (Intel Xeon / AMD EPYC)
内存: 32 GB DDR4
存储: 500 GB NVMe SSD
GPU: RTX 3060 12GB 或 RTX 4060 Ti
带宽: 50 Mbps
预估成本: 2000-3000 元/月
# 高配置(中型生产)
高配置:
CPU: 16 核
内存: 64 GB
存储: 1 TB NVMe SSD
GPU: RTX 4090 24GB
带宽: 100 Mbps
预估成本: 5000-8000 元/月
3.2 Docker Compose 配置
# docker-compose.prod.yml
version: '3.8'
services:
# ==================== 核心服务 ====================
mbe-api:
build:
context: .
dockerfile: Dockerfile
image: mbe-api:latest
container_name: mbe-api
restart: always
ports:
- "8000:8000"
environment:
- ENV=production
- DEBUG=false
- REDIS_URL=redis://redis:6379/0
- DATABASE_URL=postgresql://mbe:${DB_PASSWORD}@postgres:5432/mbe
- MINIO_ENDPOINT=minio:9000
- MINIO_ACCESS_KEY=${MINIO_ACCESS_KEY}
- MINIO_SECRET_KEY=${MINIO_SECRET_KEY}
- LLM_PROVIDER=${LLM_PROVIDER}
- OPENAI_API_KEY=${OPENAI_API_KEY}
- ZHIPU_API_KEY=${ZHIPU_API_KEY}
volumes:
- ./data:/app/data
- ./logs:/app/logs
depends_on:
- redis
- postgres
- minio
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000/api/health"]
interval: 30s
timeout: 10s
retries: 3
deploy:
resources:
limits:
cpus: '2'
memory: 4G
reservations:
cpus: '1'
memory: 2G
# ==================== 数据存储 ====================
redis:
image: redis:7-alpine
container_name: mbe-redis
restart: always
command: redis-server --appendonly yes --maxmemory 2gb --maxmemory-policy allkeys-lru
volumes:
- redis_data:/data
ports:
- "6379:6379"
healthcheck:
test: ["CMD", "redis-cli", "ping"]
interval: 10s
timeout: 5s
retries: 3
deploy:
resources:
limits:
memory: 2G
postgres:
image: postgres:15-alpine
container_name: mbe-postgres
restart: always
environment:
- POSTGRES_USER=mbe
- POSTGRES_PASSWORD=${DB_PASSWORD}
- POSTGRES_DB=mbe
volumes:
- postgres_data:/var/lib/postgresql/data
- ./init.sql:/docker-entrypoint-initdb.d/init.sql
ports:
- "5432:5432"
healthcheck:
test: ["CMD-SHELL", "pg_isready -U mbe"]
interval: 10s
timeout: 5s
retries: 3
deploy:
resources:
limits:
memory: 2G
minio:
image: minio/minio:latest
container_name: mbe-minio
restart: always
command: server /data --console-address ":9001"
environment:
- MINIO_ROOT_USER=${MINIO_ACCESS_KEY}
- MINIO_ROOT_PASSWORD=${MINIO_SECRET_KEY}
volumes:
- minio_data:/data
ports:
- "9000:9000"
- "9001:9001"
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
interval: 30s
timeout: 10s
retries: 3
# ==================== 任务处理 ====================
celery-worker:
build:
context: .
dockerfile: Dockerfile
image: mbe-api:latest
container_name: mbe-celery-worker
restart: always
command: celery -A src.tasks.celery_app worker --loglevel=info --concurrency=4
environment:
- ENV=production
- REDIS_URL=redis://redis:6379/0
- DATABASE_URL=postgresql://mbe:${DB_PASSWORD}@postgres:5432/mbe
volumes:
- ./data:/app/data
- ./logs:/app/logs
depends_on:
- redis
- postgres
deploy:
resources:
limits:
cpus: '4'
memory: 8G
reservations:
cpus: '2'
memory: 4G
celery-training:
build:
context: .
dockerfile: Dockerfile.gpu
image: mbe-api:gpu
container_name: mbe-celery-training
restart: always
command: celery -A src.tasks.celery_app worker --loglevel=info --concurrency=1 -Q training
environment:
- ENV=production
- REDIS_URL=redis://redis:6379/0
- DATABASE_URL=postgresql://mbe:${DB_PASSWORD}@postgres:5432/mbe
- CUDA_VISIBLE_DEVICES=0
volumes:
- ./data:/app/data
- ./logs:/app/logs
- ./models:/app/models
depends_on:
- redis
- postgres
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
celery-beat:
build:
context: .
dockerfile: Dockerfile
image: mbe-api:latest
container_name: mbe-celery-beat
restart: always
command: celery -A src.tasks.celery_app beat --loglevel=info
environment:
- ENV=production
- REDIS_URL=redis://redis:6379/0
depends_on:
- redis
flower:
image: mher/flower:latest
container_name: mbe-flower
restart: always
command: celery --broker=redis://redis:6379/0 flower --port=5555
ports:
- "5555:5555"
depends_on:
- redis
# ==================== 监控(可选)====================
prometheus:
image: prom/prometheus:latest
container_name: mbe-prometheus
restart: always
volumes:
- ./config/prometheus.yml:/etc/prometheus/prometheus.yml
- prometheus_data:/prometheus
ports:
- "9090:9090"
profiles:
- monitoring
grafana:
image: grafana/grafana:latest
container_name: mbe-grafana
restart: always
environment:
- GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD}
volumes:
- grafana_data:/var/lib/grafana
ports:
- "3000:3000"
profiles:
- monitoring
volumes:
redis_data:
postgres_data:
minio_data:
prometheus_data:
grafana_data:
networks:
default:
name: mbe-network
3.3 环境变量配置
# .env.production
# ==================== 基础配置 ====================
ENV=production
DEBUG=false
HOST=0.0.0.0
PORT=8000
# ==================== 数据库 ====================
DB_PASSWORD=your_secure_password_here
DATABASE_URL=postgresql://mbe:${DB_PASSWORD}@postgres:5432/mbe
# ==================== Redis ====================
REDIS_URL=redis://redis:6379/0
# ==================== MinIO ====================
MINIO_ENDPOINT=minio:9000
MINIO_ACCESS_KEY=mbe_minio_admin
MINIO_SECRET_KEY=your_minio_secret_key
# ==================== LLM 配置 ====================
LLM_PROVIDER=zhipu # openai, zhipu, qwen, deepseek
OPENAI_API_KEY=sk-xxx
OPENAI_BASE_URL=https://api.openai.com/v1
ZHIPU_API_KEY=xxx.xxx
QWEN_API_KEY=sk-xxx
# ==================== Embedding ====================
EMBEDDING_PROVIDER=zhipu # openai, zhipu, local
EMBEDDING_MODEL=embedding-3
EMBEDDING_DIMENSION=1024
# ==================== 安全配置 ====================
SECRET_KEY=your_very_long_secret_key_at_least_32_chars
JWT_ALGORITHM=HS256
JWT_EXPIRE_HOURS=24
# ==================== 监控 ====================
GRAFANA_PASSWORD=admin_password
# ==================== 邮件(可选)====================
SMTP_HOST=smtp.example.com
SMTP_PORT=465
SMTP_USER=noreply@example.com
SMTP_PASSWORD=xxx
3.4 Nginx 配置
# /etc/nginx/sites-available/mbe.conf
upstream mbe_api {
server 127.0.0.1:8000;
keepalive 32;
}
# HTTP 重定向到 HTTPS
server {
listen 80;
server_name your-domain.com;
return 301 https://$server_name$request_uri;
}
# HTTPS 配置
server {
listen 443 ssl http2;
server_name your-domain.com;
# SSL 证书
ssl_certificate /etc/letsencrypt/live/your-domain.com/fullchain.pem;
ssl_certificate_key /etc/letsencrypt/live/your-domain.com/privkey.pem;
ssl_session_timeout 1d;
ssl_session_cache shared:SSL:50m;
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256;
ssl_prefer_server_ciphers off;
# 安全头
add_header X-Frame-Options "SAMEORIGIN" always;
add_header X-Content-Type-Options "nosniff" always;
add_header X-XSS-Protection "1; mode=block" always;
add_header Strict-Transport-Security "max-age=31536000" always;
# 日志
access_log /var/log/nginx/mbe_access.log;
error_log /var/log/nginx/mbe_error.log;
# 客户端上传限制(大文件上传)
client_max_body_size 500M;
client_body_timeout 300s;
# API 代理
location / {
proxy_pass http://mbe_api;
proxy_http_version 1.1;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
# 超时配置
proxy_connect_timeout 60s;
proxy_send_timeout 300s;
proxy_read_timeout 300s;
# WebSocket 支持
# 注意:同一 location 内不要重复设置 Connection 头(既设 "" 又设 "upgrade" 会冲突)。
# 需在 http 级别定义:
#   map $http_upgrade $connection_upgrade { default upgrade; '' close; }
# 这样普通请求保持 keepalive,WebSocket 请求自动升级。
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection $connection_upgrade;
}
# 静态文件
location /static {
alias /var/www/mbe/static;
expires 30d;
add_header Cache-Control "public, immutable";
}
# 健康检查
location /health {
proxy_pass http://mbe_api/api/health;
access_log off;
}
# Flower 监控(限制访问)
location /flower/ {
auth_basic "Restricted";
auth_basic_user_file /etc/nginx/.htpasswd;
proxy_pass http://127.0.0.1:5555/;
proxy_set_header Host $host;
}
}
四、方案二:多机部署
4.1 架构设计
┌─────────────────────────────────────────────────────────────┐
│ 多机部署架构 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 节点1 (API + Worker) 节点2 (API + Worker) │
│ ┌─────────────────────┐ ┌─────────────────────┐ │
│ │ CPU: 8核 │ │ CPU: 8核 │ │
│ │ 内存: 32GB │ │ 内存: 32GB │ │
│ │ ─────────────────── │ │ ─────────────────── │ │
│ │ • MBE API ×2 │ │ • MBE API ×2 │ │
│ │ • Celery Worker ×4 │ │ • Celery Worker ×4 │ │
│ │ • Nginx │ │ • Nginx │ │
│ └─────────────────────┘ └─────────────────────┘ │
│ │
│ 节点3 (数据存储) 节点4 (GPU 训练) │
│ ┌─────────────────────┐ ┌─────────────────────┐ │
│ │ CPU: 8核 │ │ CPU: 16核 │ │
│ │ 内存: 64GB │ │ 内存: 64GB │ │
│ │ 存储: 2TB SSD │ │ GPU: RTX 4090 ×1 │ │
│ │ ─────────────────── │ │ ─────────────────── │ │
│ │ • Redis (主) │ │ • Training Worker │ │
│ │ • PostgreSQL (主) │ │ • Inference Service │ │
│ │ • MinIO │ │ │ │
│ └─────────────────────┘ └─────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────┘
4.2 节点配置清单
# 节点配置
nodes:
# API 节点(可水平扩展)
api_nodes:
count: 2
spec:
cpu: 8 cores
memory: 32 GB
storage: 100 GB SSD
network: 1 Gbps
services:
- mbe-api (2 replicas)
- celery-worker (4 workers)
- nginx (load balancer)
# 数据节点
data_node:
count: 1
spec:
cpu: 8 cores
memory: 64 GB
storage: 2 TB NVMe SSD (RAID 1)
network: 10 Gbps
services:
- redis (8GB memory)
- postgresql
- minio
# GPU 节点
gpu_node:
count: 1-N
spec:
cpu: 16 cores
memory: 64 GB
storage: 500 GB NVMe
gpu: RTX 4090 24GB
services:
- celery-training (1 worker per GPU)
- inference-service
# 网络配置
network:
internal_network: 10.0.0.0/24
api_nodes: 10.0.0.10-19
data_node: 10.0.0.100
gpu_nodes: 10.0.0.200-209
4.3 Docker Swarm 部署
# docker-stack.yml
version: '3.8'
services:
mbe-api:
image: mbe-api:latest
deploy:
replicas: 4
placement:
constraints:
- node.labels.role == api
resources:
limits:
cpus: '2'
memory: 4G
update_config:
parallelism: 1
delay: 10s
order: start-first
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
networks:
- mbe-overlay
configs:
- source: mbe_config
target: /app/.env
celery-worker:
image: mbe-api:latest
command: celery -A src.tasks.celery_app worker -l info -c 4
deploy:
replicas: 8
placement:
constraints:
- node.labels.role == api
resources:
limits:
cpus: '4'
memory: 8G
networks:
- mbe-overlay
celery-training:
image: mbe-api:gpu
command: celery -A src.tasks.celery_app worker -l info -c 1 -Q training
deploy:
replicas: 1
placement:
constraints:
- node.labels.gpu == true
resources:
reservations:
generic_resources:
- discrete_resource_spec:
kind: 'NVIDIA-GPU'
value: 1
networks:
- mbe-overlay
redis:
image: redis:7-alpine
command: redis-server --appendonly yes --maxmemory 8gb
deploy:
replicas: 1
placement:
constraints:
- node.labels.role == data
volumes:
- redis_data:/data
networks:
- mbe-overlay
postgres:
image: postgres:15-alpine
deploy:
replicas: 1
placement:
constraints:
- node.labels.role == data
volumes:
- postgres_data:/var/lib/postgresql/data
networks:
- mbe-overlay
networks:
mbe-overlay:
driver: overlay
attachable: true
volumes:
redis_data:
postgres_data:
configs:
mbe_config:
external: true
五、性能调优
5.1 系统调优
# /etc/sysctl.d/99-mbe.conf
# 网络优化
net.core.somaxconn = 65535
net.core.netdev_max_backlog = 65535
net.ipv4.tcp_max_syn_backlog = 65535
net.ipv4.tcp_fin_timeout = 10
net.ipv4.tcp_tw_reuse = 1
net.ipv4.tcp_keepalive_time = 600
net.ipv4.tcp_keepalive_intvl = 30
net.ipv4.tcp_keepalive_probes = 3
# 内存优化
vm.swappiness = 10
vm.overcommit_memory = 1
# 文件描述符
fs.file-max = 2097152
fs.nr_open = 2097152
# 应用: sysctl -p /etc/sysctl.d/99-mbe.conf
# /etc/security/limits.d/99-mbe.conf
* soft nofile 1048576
* hard nofile 1048576
* soft nproc 65535
* hard nproc 65535
5.2 Redis 调优
# redis.conf
# 内存配置
maxmemory 8gb
maxmemory-policy allkeys-lru
# 持久化
appendonly yes
appendfsync everysec
auto-aof-rewrite-percentage 100
auto-aof-rewrite-min-size 64mb
# 性能
tcp-backlog 511
tcp-keepalive 300
timeout 0
# 连接数
maxclients 10000
# 慢查询日志
slowlog-log-slower-than 10000
slowlog-max-len 128
5.3 PostgreSQL 调优
# postgresql.conf
# 连接配置
max_connections = 200
superuser_reserved_connections = 3
# 内存配置
shared_buffers = 8GB # 总内存的 25%
effective_cache_size = 24GB # 总内存的 75%
maintenance_work_mem = 1GB
work_mem = 64MB
# WAL 配置
wal_buffers = 64MB
checkpoint_completion_target = 0.9
max_wal_size = 4GB
min_wal_size = 1GB
# 查询优化
random_page_cost = 1.1 # SSD
effective_io_concurrency = 200 # SSD
# 日志
log_min_duration_statement = 1000
log_checkpoints = on
log_lock_waits = on
5.4 Celery 调优
# celery_config.py
# Worker 配置
CELERY_WORKER_CONCURRENCY = 4 # CPU 核心数
CELERY_WORKER_PREFETCH_MULTIPLIER = 4 # 预取任务数(长耗时任务建议设为 1,配合 acks_late 避免任务被单个 worker 预占)
CELERY_WORKER_MAX_TASKS_PER_CHILD = 1000 # 防止内存泄漏
# 任务配置
CELERY_TASK_ACKS_LATE = True # 任务完成后确认
CELERY_TASK_REJECT_ON_WORKER_LOST = True
CELERY_TASK_TIME_LIMIT = 3600 # 1小时超时
CELERY_TASK_SOFT_TIME_LIMIT = 3300 # 55分钟软超时
# 结果配置
CELERY_RESULT_EXPIRES = 86400 # 结果保留24小时
CELERY_RESULT_BACKEND_TRANSPORT_OPTIONS = {
'visibility_timeout': 43200, # 12小时
}
# 序列化
CELERY_TASK_SERIALIZER = 'json'
CELERY_RESULT_SERIALIZER = 'json'
CELERY_ACCEPT_CONTENT = ['json']
六、监控与告警
6.1 Prometheus 配置
# prometheus.yml
global:
scrape_interval: 15s
evaluation_interval: 15s
alerting:
alertmanagers:
- static_configs:
- targets: ['alertmanager:9093']
rule_files:
- '/etc/prometheus/rules/*.yml'
scrape_configs:
- job_name: 'mbe-api'
static_configs:
- targets: ['mbe-api:8000']
metrics_path: '/metrics'
- job_name: 'redis'
static_configs:
- targets: ['redis-exporter:9121']
- job_name: 'postgres'
static_configs:
- targets: ['postgres-exporter:9187']
- job_name: 'node'
static_configs:
- targets: ['node-exporter:9100']
- job_name: 'nvidia-gpu'
static_configs:
- targets: ['dcgm-exporter:9400']
6.2 告警规则
# alert_rules.yml
groups:
- name: mbe_alerts
rules:
# API 高延迟
- alert: HighAPILatency
expr: histogram_quantile(0.95, http_request_duration_seconds_bucket) > 2
for: 5m
labels:
severity: warning
annotations:
summary: "API 响应延迟过高"
description: "95分位延迟超过2秒"
# 错误率过高
- alert: HighErrorRate
expr: rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) > 0.05
for: 5m
labels:
severity: critical
annotations:
summary: "错误率过高"
description: "5xx 错误率超过 5%"
# Redis 内存不足
- alert: RedisMemoryHigh
expr: redis_memory_used_bytes / redis_memory_max_bytes > 0.85
for: 5m
labels:
severity: warning
annotations:
summary: "Redis 内存使用过高"
# 磁盘空间不足
- alert: DiskSpaceLow
expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} < 0.15
for: 5m
labels:
severity: warning
annotations:
summary: "磁盘空间不足 15%"
# GPU 利用率低(训练任务可能卡住)
- alert: GPUUtilizationLow
expr: DCGM_FI_DEV_GPU_UTIL < 10
for: 30m
labels:
severity: warning
annotations:
summary: "GPU 利用率过低,训练可能卡住"
# Celery 队列积压
- alert: CeleryQueueBacklog
expr: celery_queue_length > 100
for: 10m
labels:
severity: warning
annotations:
summary: "Celery 任务队列积压"
七、备份策略
7.1 备份脚本
#!/bin/bash
# backup.sh - MBE 数据备份脚本
set -e
BACKUP_DIR="/backup/mbe"
DATE=$(date +%Y%m%d_%H%M%S)
RETENTION_DAYS=7
# 创建备份目录
mkdir -p ${BACKUP_DIR}/${DATE}
echo "开始备份 MBE 数据..."
# 1. PostgreSQL 备份
echo "备份 PostgreSQL..."
docker exec mbe-postgres pg_dump -U mbe mbe | gzip > ${BACKUP_DIR}/${DATE}/postgres.sql.gz
# 2. Redis 备份
echo "备份 Redis..."
docker exec mbe-redis redis-cli BGSAVE
# BGSAVE 是异步的,固定 sleep 可能在 RDB 未写完时就拷贝;轮询等待其完成
while docker exec mbe-redis redis-cli INFO persistence | grep -q 'rdb_bgsave_in_progress:1'; do
    sleep 1
done
docker cp mbe-redis:/data/dump.rdb ${BACKUP_DIR}/${DATE}/redis.rdb
# 3. MinIO 数据备份(增量)
echo "备份 MinIO..."
docker run --rm -v mbe_minio_data:/data -v ${BACKUP_DIR}/${DATE}:/backup alpine \
tar czf /backup/minio.tar.gz /data
# 4. 配置文件备份
echo "备份配置文件..."
cp -r /opt/mbe/.env* ${BACKUP_DIR}/${DATE}/
cp -r /opt/mbe/docker-compose*.yml ${BACKUP_DIR}/${DATE}/
# 5. 模型文件备份
echo "备份模型文件..."
tar czf ${BACKUP_DIR}/${DATE}/models.tar.gz /opt/mbe/models/
# 6. 创建校验文件
echo "生成校验和..."
cd ${BACKUP_DIR}/${DATE}
sha256sum * > checksums.sha256
# 7. 清理旧备份
echo "清理 ${RETENTION_DAYS} 天前的备份..."
find ${BACKUP_DIR} -mindepth 1 -maxdepth 1 -type d -mtime +${RETENTION_DAYS} -exec rm -rf {} +
# 8. 上传到远程存储(可选)
# aws s3 sync ${BACKUP_DIR}/${DATE} s3://your-bucket/mbe-backup/${DATE}
echo "备份完成: ${BACKUP_DIR}/${DATE}"
7.2 定时任务
# crontab -e
# 每天凌晨 3 点执行备份
0 3 * * * /opt/mbe/scripts/backup.sh >> /var/log/mbe-backup.log 2>&1
# 每小时检查服务健康
0 * * * * /opt/mbe/scripts/health_check.sh >> /var/log/mbe-health.log 2>&1
八、快速启动命令
# 1. 克隆代码
git clone https://github.com/your-org/mbe.git
cd mbe
# 2. 配置环境变量
cp .env.example .env.production
vim .env.production # 编辑配置
# 3. 启动服务
docker-compose -f docker-compose.prod.yml up -d
# 4. 检查状态
docker-compose -f docker-compose.prod.yml ps
docker-compose -f docker-compose.prod.yml logs -f mbe-api
# 5. 初始化数据库
docker exec mbe-api python -m src.storage.init_db
# 6. 创建管理员
docker exec -it mbe-api python -c "
from src.users.service import create_admin
create_admin('admin@example.com', 'admin_password')
"
# 7. 验证服务
curl http://localhost:8000/api/health
九、常见问题排查
| 问题 | 可能原因 | 解决方案 |
| --- | --- | --- |
| API 响应慢 | Redis 连接数满 | 增加 maxclients 配置 |
| 训练任务卡住 | GPU 显存不足 | 减小 batch_size 或使用梯度累积 |
| 文件上传失败 | Nginx body_size 限制 | 增大 client_max_body_size |
| Celery 任务丢失 | Worker 崩溃 | 启用 acks_late 和 reject_on_worker_lost |
| 磁盘空间不足 | 日志/检查点积累 | 配置日志轮转,清理旧检查点 |
文档版本: v1.0
更新日期: 2026-01-23