
本地运行大模型从未如此简单。Docker 化的 Ollama 让你的 AI 基础设施可复现、可扩展。
Ollama 是运行开源大语言模型的利器,但直接安装在宿主机上会带来环境管理的麻烦。Docker 化部署有以下优势:
| 优势 | 说明 |
|---|---|
| 可复现性 | 开发、测试、生产环境完全一致 |
| 隔离性 | 不污染宿主机,依赖管理清晰 |
| 可扩展性 | 轻松部署多实例,配合负载均衡 |
| GPU 管理 | 统一配置,跨主机一致 |
最简单的启动方式,适合开发测试:
# Fetch the image, then start Ollama detached with its API published on host port 11434.
docker pull ollama/ollama
docker run --detach \
  --name ollama \
  --publish 11434:11434 \
  ollama/ollama

# Download the model inside the running container, then open an interactive session with it.
docker exec --interactive --tty ollama ollama pull llama3.2
docker exec --interactive --tty ollama ollama run llama3.2

# Call the generate endpoint over HTTP from the host.
curl http://localhost:11434/api/generate --data '{
"model": "llama3.2",
"prompt": "为什么天空是蓝色的?"
}'
生产环境必须开启 GPU 加速。
# Install the NVIDIA Container Toolkit (Debian/Ubuntu) so Docker can hand GPUs to containers.
# Store the repository key in a dedicated keyring; `apt-key` is deprecated.
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
  | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
# Register the apt repository and pin it to that keyring via signed-by.
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
  | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
  | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
sudo apt-get update
sudo apt-get install -y nvidia-container-toolkit
# Register the NVIDIA runtime in Docker's configuration, then restart the daemon to apply it.
sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker
# The two `docker run` commands below use the same container name and host port,
# so they are alternatives: run only one of them.

# Option 1: expose every GPU on the host to the container.
docker run --detach \
  --name ollama-gpu \
  --gpus all \
  --publish 11434:11434 \
  ollama/ollama

# Option 2: expose only GPU device 0.
docker run --detach \
  --name ollama-gpu \
  --gpus '"device=0"' \
  --publish 11434:11434 \
  ollama/ollama

# Verify from inside the container: GPU visibility first, then the local model list.
docker exec ollama-gpu nvidia-smi
docker exec ollama-gpu ollama list
# Single-container Ollama with persistent model storage and a health probe.
version: "3.8"

services:
  ollama:
    image: ollama/ollama:latest
    container_name: ollama
    ports:
      - "11434:11434"
    volumes:
      # Named volume, so downloaded models survive container re-creation.
      - ollama_data:/root/.ollama
    restart: unless-stopped
    healthcheck:
      # The ollama/ollama image does not ship curl, so a curl-based probe always
      # fails. `ollama list` queries the local server and exits non-zero while it
      # is unreachable.
      test: ["CMD", "ollama", "list"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s

volumes:
  ollama_data: {}
# GPU-enabled Ollama service (requires the NVIDIA Container Toolkit on the host).
version: "3.8"

services:
  ollama:
    image: ollama/ollama:latest
    container_name: ollama-gpu
    ports:
      - "11434:11434"
    volumes:
      - ollama_data:/root/.ollama
    deploy:
      resources:
        reservations:
          devices:
            # Reserve every NVIDIA GPU on the host for this container.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
    restart: unless-stopped
    healthcheck:
      # The ollama/ollama image does not ship curl, so a curl-based probe always
      # fails; use the bundled CLI to check that the server answers.
      test: ["CMD", "ollama", "list"]
      interval: 30s
      timeout: 10s
      retries: 3

volumes:
  ollama_data: {}
启动:
# Start the stack detached, reading the compose file named docker-compose.gpu.yml.
docker compose -f docker-compose.gpu.yml up -d
# Production-oriented Ollama service: loopback-only port, GPU reservation,
# CPU/memory caps and bounded container logs.
version: "3.8"

services:
  ollama:
    image: ollama/ollama:latest
    container_name: ollama
    ports:
      # Publish on the host loopback interface only; not reachable from the public network.
      - "127.0.0.1:11434:11434"
    volumes:
      - ollama_data:/root/.ollama
    environment:
      - OLLAMA_HOST=0.0.0.0
      - OLLAMA_NUM_PARALLEL=4
      - OLLAMA_MAX_LOADED_MODELS=2
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
        # Hard caps; `limits` is a sibling of `reservations` under `resources`.
        limits:
          cpus: "8"
          memory: 32G
    restart: unless-stopped
    logging:
      driver: "json-file"
      options:
        # Rotate at 100 MB and keep at most three files.
        max-size: "100m"
        max-file: "3"

volumes:
  ollama_data: {}
生产配置要点:
- 仅绑定本地,需通过反向代理暴露

模型文件动辄几个 GB,管理策略很重要。
# Fragment: declares a named volume backed by the local driver.
volumes:
  ollama_data:
    driver: local

# Fragment: bind-mounts the host directory /data/ollama/models onto /root/.ollama.
# As written, the service uses this bind mount rather than the named volume above.
services:
  ollama:
    volumes:
      - /data/ollama/models:/root/.ollama
# Ollama plus a one-shot helper that pulls models once the server is healthy.
version: "3.8"

services:
  ollama:
    image: ollama/ollama:latest
    ports:
      - "11434:11434"
    volumes:
      - ollama_data:/root/.ollama
    healthcheck:
      # The ollama/ollama image does not ship curl. A curl-based probe would keep
      # this service unhealthy forever, and model-loader (gated on
      # service_healthy below) would never start.
      test: ["CMD", "ollama", "list"]
      interval: 30s
      timeout: 10s
      retries: 3

  model-loader:
    # This image provides curl; it reaches the server through the service name `ollama`.
    image: curlimages/curl:latest
    depends_on:
      ollama:
        condition: service_healthy
    # Pull the models one after another; the second pull runs only if the first succeeds.
    entrypoint: >
      sh -c "
      curl -X POST http://ollama:11434/api/pull -d '{\"name\": \"llama3.2\"}' &&
      curl -X POST http://ollama:11434/api/pull -d '{\"name\": \"codellama\"}'
      "
    # One-shot job: do not restart after it exits.
    restart: "no"

volumes:
  ollama_data: {}
# Show the size of each entry in the model directory. The glob is wrapped in
# `sh -c` so it expands inside the container; unquoted, the host shell would try
# to expand it against the host filesystem and `du` would receive a literal `*`.
docker exec ollama sh -c 'du -sh /root/.ollama/models/*'

# Back up: archive the ollama_data volume into the current host directory.
# "$(pwd)" is quoted so paths containing spaces do not split the -v argument.
docker run --rm \
  -v ollama_data:/data \
  -v "$(pwd)":/backup \
  alpine tar czf /backup/ollama-backup.tar.gz /data

# Restore: unpack the archive back into the volume. tar stored the members as
# data/..., so extracting at / puts them under /data again.
docker run --rm \
  -v ollama_data:/data \
  -v "$(pwd)":/backup \
  alpine tar xzf /backup/ollama-backup.tar.gz -C /
# Commonly tuned Ollama server settings, passed as container environment variables.
services:
  ollama:
    image: ollama/ollama:latest
    environment:
      # Bind address and port. Ollama takes the port from OLLAMA_HOST; it has no
      # separate OLLAMA_PORT setting, so the port is given here.
      - OLLAMA_HOST=0.0.0.0:11434
      # Debug logging.
      - OLLAMA_DEBUG=1
      # Number of parallel requests.
      - OLLAMA_NUM_PARALLEL=4
      # Maximum number of loaded models.
      - OLLAMA_MAX_LOADED_MODELS=2
      # How long a model is kept in memory (seconds).
      - OLLAMA_KEEP_ALIVE=300
# List the models available on the server.
curl http://localhost:11434/api/tags

# Text generation with streaming turned off.
curl http://localhost:11434/api/generate --data '{
"model": "llama3.2",
"prompt": "用一句话解释 Docker",
"stream": false
}'

# Chat: the prior turns are sent along with the new user message.
curl http://localhost:11434/api/chat --data '{
"model": "llama3.2",
"messages": [
{"role": "user", "content": "你好!"},
{"role": "assistant", "content": "你好!有什么可以帮助你的?"},
{"role": "user", "content": "什么是 Kubernetes?"}
]
}'

# Embeddings for a piece of text.
curl http://localhost:11434/api/embeddings --data '{
"model": "llama3.2",
"prompt": "Docker 容器是轻量级的"
}'

# Pull a model, then delete it again, through the HTTP API.
curl --request POST http://localhost:11434/api/pull --data '{"name": "codellama"}'
curl --request DELETE http://localhost:11434/api/delete --data '{"name": "codellama"}'
# Two Ollama instances, each pinned to one GPU, behind an nginx load balancer.
version: "3.8"

services:
  nginx:
    image: nginx:alpine
    ports:
      # Host port 11434 maps to nginx's port 80.
      - "11434:80"
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf:ro
    depends_on:
      - ollama-1
      - ollama-2

  ollama-1:
    image: ollama/ollama:latest
    volumes:
      # Both instances mount the same volume at /root/.ollama.
      - ollama_shared:/root/.ollama
    deploy:
      resources:
        reservations:
          devices:
            # Pin this instance to GPU 0.
            - driver: nvidia
              device_ids: ["0"]
              capabilities: [gpu]
    environment:
      - OLLAMA_HOST=0.0.0.0
    healthcheck:
      # The ollama/ollama image does not ship curl, so a curl-based probe always
      # fails; use the bundled CLI to check that the server answers.
      test: ["CMD", "ollama", "list"]
      interval: 30s
      timeout: 10s
      retries: 3

  ollama-2:
    image: ollama/ollama:latest
    volumes:
      - ollama_shared:/root/.ollama
    deploy:
      resources:
        reservations:
          devices:
            # Pin this instance to GPU 1.
            - driver: nvidia
              device_ids: ["1"]
              capabilities: [gpu]
    environment:
      - OLLAMA_HOST=0.0.0.0
    healthcheck:
      # Same CLI-based probe as ollama-1 (no curl in the image).
      test: ["CMD", "ollama", "list"]
      interval: 30s
      timeout: 10s
      retries: 3

volumes:
  ollama_shared: {}
# nginx.conf: reverse proxy that spreads requests across the two Ollama backends.
events {
    worker_connections 1024;
}

http {
    # Backend pool, balanced by least connections, with idle upstream
    # connections kept open for reuse.
    upstream ollama_cluster {
        least_conn;
        server ollama-1:11434;
        server ollama-2:11434;
        keepalive 32;
    }

    server {
        listen 80;

        # Long timeouts for connecting, sending and reading.
        proxy_connect_timeout 300s;
        proxy_send_timeout 300s;
        proxy_read_timeout 300s;

        location / {
            proxy_pass http://ollama_cluster;

            # HTTP/1.1 with an empty Connection header lets upstream keepalive work.
            proxy_http_version 1.1;
            proxy_set_header Connection "";
            proxy_set_header Host $host;

            # Pass response data through without buffering it in nginx.
            proxy_buffering off;
        }
    }
}
# Application stack: Ollama (GPU), an application container and Redis on one network.
version: "3.8"

services:
  ollama:
    image: ollama/ollama:latest
    volumes:
      - ollama_data:/root/.ollama
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      - OLLAMA_HOST=0.0.0.0
    # No host port is published; the service is reached over app-network.
    networks:
      - app-network

  app:
    image: your-app:latest
    ports:
      - "8080:8080"
    environment:
      # Address Ollama by its service name on the shared network.
      - OLLAMA_BASE_URL=http://ollama:11434
      - OLLAMA_MODEL=llama3.2
    depends_on:
      - ollama
    networks:
      - app-network

  redis:
    image: redis:alpine
    volumes:
      - redis_data:/data
    networks:
      - app-network

networks:
  app-network: {}

volumes:
  ollama_data: {}
  redis_data: {}
| 建议 | 说明 |
|---|---|
| 使用命名卷 | 模型持久化,容器重建不丢失 |
| 设置资源限制 | 防止内存溢出 |
| 配置健康检查 | 确保服务就绪后再启动依赖 |
| 绑定 localhost | 生产环境不直接暴露外网 |

| 建议 | 说明 |
|---|---|
| 安装 NVIDIA Container Toolkit | 所有 GPU 主机必需 |
| 指定 GPU 设备 | 多卡场景避免竞争 |
| 监控显存 | 大模型容易撑爆 VRAM |

| 建议 | 说明 |
|---|---|
| 使用 Docker 网络 | 服务间通信 |
| 设置合理超时 | LLM 推理可能耗时数秒 |
| 负载均衡 | 高可用部署 |

| 建议 | 说明 |
|---|---|
| 不直接暴露公网 | 必须有认证层 |
| 反向代理 + TLS | Nginx/Caddy |
| 非 root 用户 | 使用 user namespaces |

| 建议 | 说明 |
|---|---|
| 启用日志轮转 | 防止磁盘被日志撑满 |
| 监控 API 响应时间 | 检测性能退化 |
| 追踪 GPU 利用率 | 优化实例规格 |
从单容器开发测试,到 GPU 加速,再到多实例负载均衡,Docker 让 Ollama 的部署变得简单可控。
建议的学习路径:
1. 先用 CPU 模式跑通流程
2. 加上 GPU 加速体验性能提升
3. 用 Docker Compose 管理生产配置
4. 多实例部署应对高并发