commit 5cb24a8eed: init

.env.example (new file)
@@ -0,0 +1,228 @@
# Environment Configuration Template
# Copy this file to .env and update with your actual values

# ================================
# HETZNER CONFIGURATION
# ================================

# Hetzner Cloud API Token (get from Hetzner Cloud Console)
HCLOUD_TOKEN=your_hcloud_token_here

# Hetzner Robot API credentials (for dedicated servers)
ROBOT_API_USER=your_robot_username
ROBOT_API_PASSWORD=your_robot_password

# ================================
# SSH CONFIGURATION
# ================================

# SSH public key content (paste the full key)
SSH_PUBLIC_KEY="ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC7... your-email@domain.com"

# Path to SSH private key
SSH_PRIVATE_KEY_PATH=~/.ssh/hetzner_key

# SSH key name in Hetzner Cloud
SSH_KEY_NAME=ai-infrastructure

# ================================
# DOMAIN CONFIGURATION
# ================================

# Domain for API endpoint (optional, can use IP)
API_DOMAIN=api.yourdomain.com

# Domain for monitoring dashboard (optional)
MONITORING_DOMAIN=monitoring.yourdomain.com

# ================================
# ENVIRONMENT SETTINGS
# ================================

# Deployment environment (dev, staging, production)
ENVIRONMENT=production

# Project name for resource tagging
PROJECT_NAME=ai-infrastructure

# Cost center for billing tracking
COST_CENTER=engineering

# ================================
# SECURITY CONFIGURATION
# ================================

# Grafana admin password (change this!)
GRAFANA_ADMIN_PASSWORD=change_this_secure_password

# Ansible Vault password (change this!)
ANSIBLE_VAULT_PASSWORD=change_this_vault_password

# Allowed IP ranges for SSH access (comma-separated CIDR blocks)
# Use 0.0.0.0/0 for testing only, restrict in production
ALLOWED_SSH_CIDRS=203.0.113.0/24,198.51.100.0/24

# ================================
# GITLAB CI/CD CONFIGURATION
# ================================

# GitLab personal access token (for CI/CD)
GITLAB_TOKEN=your_gitlab_token_here

# GitLab project URL for ansible-pull
ANSIBLE_REPO_URL=https://gitlab.com/yourorg/ai-infrastructure.git

# GitLab deploy token (for repository access)
GITLAB_DEPLOY_TOKEN=your_deploy_token

# ================================
# AUTO-SCALING CONFIGURATION
# ================================

# Minimum number of GEX44 servers
MIN_GEX44_COUNT=1

# Maximum number of GEX44 servers
MAX_GEX44_COUNT=5

# GPU utilization threshold for scaling up (0.0-1.0)
SCALE_UP_THRESHOLD=0.8

# GPU utilization threshold for scaling down (0.0-1.0)
SCALE_DOWN_THRESHOLD=0.3

# ================================
# MODEL CONFIGURATION
# ================================

# Default model to deploy
DEFAULT_MODEL=mixtral-8x7b

# Models to download and cache
MODELS_TO_DOWNLOAD=mixtral-8x7b,llama2-70b,codellama-34b

# HuggingFace token (for private models, optional)
HUGGINGFACE_TOKEN=your_hf_token

# ================================
# MONITORING CONFIGURATION
# ================================

# Prometheus data retention period
PROMETHEUS_RETENTION=30d

# Grafana data retention period
GRAFANA_RETENTION=90d

# Alert email address
ALERT_EMAIL=alerts@yourdomain.com

# Slack webhook URL for alerts (optional)
SLACK_WEBHOOK_URL=https://hooks.slack.com/services/T00000000/B00000000/XXXXXXXXXXXXXXXXXXXXXXXX

# ================================
# BACKUP CONFIGURATION
# ================================

# Enable automated backups
BACKUP_ENABLED=true

# Backup retention period (days)
BACKUP_RETENTION_DAYS=7

# Backup storage location (S3 bucket, etc.)
BACKUP_STORAGE_URL=s3://your-backup-bucket/ai-infrastructure

# ================================
# PERFORMANCE TUNING
# ================================

# Load balancer server type
LOAD_BALANCER_TYPE=cx31

# API Gateway server type
API_GATEWAY_TYPE=cx31

# Monitoring server type
MONITORING_TYPE=cx21

# Additional storage size (GB)
ADDITIONAL_STORAGE_SIZE=500

# ================================
# DEVELOPMENT/TESTING
# ================================

# API URL for testing (set automatically in CI/CD)
API_URL=https://api.yourdomain.com

# Enable development tools
DEV_TOOLS_ENABLED=false

# Skip SSL verification for testing
SKIP_SSL_VERIFY=false

# ================================
# COST TRACKING
# ================================

# Currency for cost reporting
COST_CURRENCY=EUR

# Cost tracking tags
COST_TAGS=project:ai-infrastructure,team:engineering,environment:production

# Budget alert threshold (monthly EUR)
BUDGET_ALERT_THRESHOLD=1000

# ================================
# ADVANCED CONFIGURATION
# ================================

# Enable cloud load balancer (alternative to HAProxy)
ENABLE_CLOUD_LB=false

# Enable floating IP for HA
ENABLE_FLOATING_IP=false

# Enable advanced monitoring
ENABLE_ADVANCED_MONITORING=true

# Network zone
NETWORK_ZONE=eu-central

# Private network CIDR
PRIVATE_NETWORK_CIDR=10.0.0.0/16

# GEX44 subnet
GEX44_SUBNET=10.0.1.0/24

# Cloud subnet
CLOUD_SUBNET=10.0.2.0/24

# ================================
# TERRAFORM BACKEND
# ================================

# Terraform state backend type (gitlab, s3, local)
TF_BACKEND_TYPE=gitlab

# S3 backend configuration (if using S3)
TF_STATE_BUCKET=your-terraform-state-bucket
TF_STATE_REGION=eu-central-1

# GitLab backend configuration (if using GitLab)
TF_GITLAB_PROJECT_ID=12345

# ================================
# LOGGING CONFIGURATION
# ================================

# Log level (DEBUG, INFO, WARNING, ERROR)
LOG_LEVEL=INFO

# Centralized logging (optional)
LOG_AGGREGATION_URL=https://logs.yourdomain.com

# Log retention period (days)
LOG_RETENTION_DAYS=30
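Everything above is consumed through the process environment at deploy time. As a quick sanity check before running `make deploy-*`, here is a minimal sketch that fails fast when required values are missing. It assumes the python-dotenv package is installed; the script itself and its `REQUIRED` list are illustrative, not part of this commit.

```python
# check_env.py -- illustrative only; not part of this commit.
import os
import sys

from dotenv import load_dotenv  # assumes python-dotenv is installed

REQUIRED = ["HCLOUD_TOKEN", "ROBOT_API_USER", "ROBOT_API_PASSWORD", "SSH_PUBLIC_KEY"]

def check_required_env() -> None:
    """Exit with a readable error if any required variable is unset."""
    load_dotenv()  # reads .env from the current directory
    missing = [name for name in REQUIRED if not os.environ.get(name)]
    if missing:
        sys.exit(f"Missing required environment variables: {', '.join(missing)}")

if __name__ == "__main__":
    check_required_env()
    print("Environment looks complete.")
```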
.gitlab-ci.yml (new file)
@@ -0,0 +1,504 @@
# GitLab CI/CD Pipeline for AI Infrastructure
# Production-ready pipeline with comprehensive testing and deployment

stages:
  - validate
  - test
  - security
  - deploy-staging
  - integration-test
  - deploy-production
  - post-deploy

variables:
  TF_ROOT: terraform
  ANSIBLE_ROOT: ansible
  TF_VERSION: "1.6.0"
  ANSIBLE_VERSION: "8.5.0"
  PYTHON_VERSION: "3.11"
  GO_VERSION: "1.21"

  # Terraform state configuration
  TF_STATE_NAME: ai-infrastructure
  TF_CACHE_KEY: "$CI_COMMIT_REF_SLUG"

  # Security scanning
  SECURITY_SCAN_ENABLED: "true"

  # Performance testing
  LOAD_TEST_ENABLED: "true"

  # Deployment settings
  DEPLOY_TIMEOUT: "1800"  # 30 minutes

# Templates for reusability
.terraform_base: &terraform_base
  image: hashicorp/terraform:$TF_VERSION
  before_script:
    - cd $TF_ROOT
    - terraform --version
    - |
      cat << EOF > backend.tf
      terraform {
        backend "http" {
          address        = "$CI_API_V4_URL/projects/$CI_PROJECT_ID/terraform/state/$TF_STATE_NAME"
          lock_address   = "$CI_API_V4_URL/projects/$CI_PROJECT_ID/terraform/state/$TF_STATE_NAME/lock"
          unlock_address = "$CI_API_V4_URL/projects/$CI_PROJECT_ID/terraform/state/$TF_STATE_NAME/lock"
          username       = "gitlab-ci-token"
          password       = "$CI_JOB_TOKEN"
          lock_method    = "POST"
          unlock_method  = "DELETE"
          retry_wait_min = 5
        }
      }
      EOF
    - terraform init

.ansible_base: &ansible_base
  image: quay.io/ansible/ansible-runner:latest
  before_script:
    - cd $ANSIBLE_ROOT
    - ansible --version
    - ansible-galaxy install -r requirements.yml
    - echo "$ANSIBLE_VAULT_PASSWORD" > /tmp/.vault-pass
    - chmod 600 /tmp/.vault-pass

.docker_base: &docker_base
  image: docker:latest
  services:
    - docker:dind
  variables:
    DOCKER_HOST: tcp://docker:2376
    DOCKER_TLS_CERTDIR: "/certs"

# Cache configurations
.terraform_cache: &terraform_cache
  cache:
    key: terraform-$CI_COMMIT_REF_SLUG
    paths:
      - $TF_ROOT/.terraform/
      - $TF_ROOT/.terraform.lock.hcl

.ansible_cache: &ansible_cache
  cache:
    key: ansible-$CI_COMMIT_REF_SLUG
    paths:
      - $ANSIBLE_ROOT/collections/
      - $ANSIBLE_ROOT/roles/

# ================================
# VALIDATION STAGE
# ================================

terraform_format_check:
  <<: *terraform_base
  <<: *terraform_cache
  stage: validate
  script:
    - terraform fmt -check=true -recursive
  rules:
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
    - if: $CI_COMMIT_BRANCH == "main"

terraform_validate:
  <<: *terraform_base
  <<: *terraform_cache
  stage: validate
  script:
    - cd environments/dev
    - terraform validate
    - cd ../staging
    - terraform validate
    - cd ../production
    - terraform validate
  rules:
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
    - if: $CI_COMMIT_BRANCH == "main"

ansible_syntax_check:
  <<: *ansible_base
  <<: *ansible_cache
  stage: validate
  script:
    - ansible-playbook --syntax-check playbooks/site.yml
    - ansible-playbook --syntax-check playbooks/gex44-setup.yml
  rules:
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
    - if: $CI_COMMIT_BRANCH == "main"

ansible_lint:
  <<: *ansible_base
  <<: *ansible_cache
  stage: validate
  script:
    - ansible-lint playbooks/ || true  # Non-blocking for now
  allow_failure: true
  rules:
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
    - if: $CI_COMMIT_BRANCH == "main"

yaml_lint:
  image: python:$PYTHON_VERSION-slim
  stage: validate
  before_script:
    - pip install yamllint
  script:
    - yamllint .gitlab-ci.yml
    - yamllint ansible/
    - yamllint monitoring/
  rules:
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
    - if: $CI_COMMIT_BRANCH == "main"

# ================================
# TEST STAGE
# ================================

terraform_test:
  image: golang:$GO_VERSION
  stage: test
  before_script:
    - cd tests/terraform
    - go mod download
  script:
    - go test -v -timeout 30m ./...
  artifacts:
    reports:
      junit: tests/terraform/test-results.xml
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"

ansible_molecule_test:
  <<: *docker_base
  <<: *ansible_cache
  stage: test
  before_script:
    - apk add --no-cache python3 py3-pip
    - pip3 install ansible molecule[docker] docker
    - cd $ANSIBLE_ROOT
  script:
    - cd roles/vllm && molecule test
    - cd ../cuda && molecule test
  artifacts:
    reports:
      junit: ansible/molecule/test-results.xml
  rules:
    - if: $CI_COMMIT_BRANCH == "main"

python_unit_tests:
  image: python:$PYTHON_VERSION
  stage: test
  before_script:
    - pip install -r tests/requirements.txt
  script:
    - python -m pytest tests/unit/ -v --junitxml=test-results.xml
  artifacts:
    reports:
      junit: test-results.xml
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"

# ================================
# SECURITY STAGE
# ================================

terraform_security_scan:
  image: bridgecrew/checkov:latest
  stage: security
  script:
    - checkov -d terraform/ --framework terraform --output junitxml --output-file-path checkov-results.xml
  artifacts:
    reports:
      junit: checkov-results.xml
  allow_failure: true
  rules:
    - if: $SECURITY_SCAN_ENABLED == "true"

ansible_security_scan:
  image: quay.io/ansible/ansible-lint:latest
  stage: security
  script:
    - ansible-lint ansible/playbooks/ --format sarif --output ansible-security.sarif
  artifacts:
    reports:
      sast: ansible-security.sarif
  allow_failure: true
  rules:
    - if: $SECURITY_SCAN_ENABLED == "true"

secret_detection:
  image: gitguardian/ggshield:latest
  stage: security
  script:
    - ggshield secret scan path .
  allow_failure: true
  rules:
    - if: $SECURITY_SCAN_ENABLED == "true"

# ================================
# STAGING DEPLOYMENT
# ================================

deploy_staging_infrastructure:
  <<: *terraform_base
  <<: *terraform_cache
  stage: deploy-staging
  environment:
    name: staging
    url: https://api-staging.${CI_PROJECT_NAME}.com
    deployment_tier: staging
  script:
    - cd environments/staging
    - terraform plan -out=staging.tfplan
    - terraform apply -auto-approve staging.tfplan
  artifacts:
    name: staging-infrastructure
    paths:
      - $TF_ROOT/environments/staging/staging.tfplan
    expire_in: 1 week
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
  timeout: 30m

configure_staging_servers:
  <<: *ansible_base
  <<: *ansible_cache
  stage: deploy-staging
  environment:
    name: staging
  needs: ["deploy_staging_infrastructure"]
  script:
    - ansible-playbook -i inventory/staging.yml playbooks/site.yml --vault-password-file /tmp/.vault-pass
  artifacts:
    name: staging-configuration
    paths:
      - $ANSIBLE_ROOT/logs/
    expire_in: 1 week
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
  timeout: 45m

# ================================
# INTEGRATION TESTS
# ================================

api_contract_tests:
  image: python:$PYTHON_VERSION
  stage: integration-test
  needs: ["configure_staging_servers"]
  before_script:
    - pip install -r tests/contracts/requirements.txt
  script:
    - python tests/contracts/test_inference_api.py --api-url="$STAGING_API_URL"
  artifacts:
    reports:
      junit: tests/contracts/test-results.xml
  rules:
    - if: $CI_COMMIT_BRANCH == "main"

load_test:
  image: grafana/k6:latest
  stage: integration-test
  needs: ["configure_staging_servers"]
  script:
    - k6 run tests/load/k6_inference_test.js --env API_URL="$STAGING_API_URL"
  artifacts:
    reports:
      performance: tests/load/k6-report.json
  rules:
    - if: $LOAD_TEST_ENABLED == "true" && $CI_COMMIT_BRANCH == "main"

end_to_end_test:
  image: python:$PYTHON_VERSION
  stage: integration-test
  needs: ["configure_staging_servers"]
  before_script:
    - pip install requests pytest
  script:
    - python tests/integration/e2e_test.py --staging-url="$STAGING_API_URL"
  artifacts:
    reports:
      junit: tests/integration/e2e-results.xml
  rules:
    - if: $CI_COMMIT_BRANCH == "main"

# ================================
# PRODUCTION DEPLOYMENT
# ================================

deploy_production_infrastructure:
  <<: *terraform_base
  <<: *terraform_cache
  stage: deploy-production
  environment:
    name: production
    url: https://api.${CI_PROJECT_NAME}.com
    deployment_tier: production
  script:
    - cd environments/production
    - terraform plan -out=production.tfplan
    - terraform apply -auto-approve production.tfplan
  artifacts:
    name: production-infrastructure
    paths:
      - $TF_ROOT/environments/production/production.tfplan
    expire_in: 1 month
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
      when: manual
      allow_failure: false
  timeout: 30m

configure_production_servers:
  <<: *ansible_base
  <<: *ansible_cache
  stage: deploy-production
  environment:
    name: production
  needs: ["deploy_production_infrastructure"]
  script:
    - ansible-playbook -i inventory/production.yml playbooks/site.yml --vault-password-file /tmp/.vault-pass
  artifacts:
    name: production-configuration
    paths:
      - $ANSIBLE_ROOT/logs/
    expire_in: 1 month
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
      when: manual
  timeout: 45m

# ================================
# POST-DEPLOYMENT
# ================================

production_smoke_tests:
  image: curlimages/curl:latest
  stage: post-deploy
  needs: ["configure_production_servers"]
  script:
    - |
      echo "Running smoke tests against production..."

      # Health check
      curl -f "$PRODUCTION_API_URL/health" || exit 1
      echo "✓ Health check passed"

      # Models endpoint
      curl -f "$PRODUCTION_API_URL/v1/models" || exit 1
      echo "✓ Models endpoint accessible"

      # Metrics endpoint (internal)
      curl -f "$PRODUCTION_API_URL/metrics" || exit 1
      echo "✓ Metrics endpoint accessible"

      # Monitoring dashboard
      curl -f "$PRODUCTION_MONITORING_URL" || exit 1
      echo "✓ Monitoring dashboard accessible"

      echo "All smoke tests passed!"
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
      when: manual

performance_baseline:
  image: grafana/k6:latest
  stage: post-deploy
  needs: ["configure_production_servers"]
  script:
    - k6 run tests/load/baseline_test.js --env API_URL="$PRODUCTION_API_URL"
  artifacts:
    reports:
      performance: tests/load/baseline-report.json
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
      when: manual

cost_analysis:
  image: python:$PYTHON_VERSION
  stage: post-deploy
  before_script:
    - pip install hcloud python-dateutil jinja2
  script:
    - python scripts/cost-analysis.py --environment=production --format=json > cost-report.json
    - python scripts/cost-analysis.py --environment=production --format=markdown > cost-report.md
  artifacts:
    name: cost-analysis-$CI_COMMIT_SHORT_SHA
    paths:
      - cost-report.json
      - cost-report.md
    expire_in: 1 month
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
      when: manual

# ================================
# CLEANUP AND UTILITIES
# ================================

destroy_staging:
  <<: *terraform_base
  stage: deploy-staging
  environment:
    name: staging
    action: stop
  script:
    - cd environments/staging
    - terraform destroy -auto-approve
  rules:
    - if: $CI_PIPELINE_SOURCE == "web"
      when: manual
    - if: $CI_COMMIT_BRANCH != "main"
      when: manual

# ================================
# SCHEDULED JOBS
# ================================

nightly_full_test:
  extends: terraform_test
  rules:
    - if: $CI_PIPELINE_SOURCE == "schedule" && $SCHEDULE_TYPE == "nightly"
  parallel:
    matrix:
      - ENVIRONMENT: [staging, production]

weekly_security_scan:
  extends: terraform_security_scan
  rules:
    - if: $CI_PIPELINE_SOURCE == "schedule" && $SCHEDULE_TYPE == "weekly"

# ================================
# DEPLOYMENT NOTIFICATIONS
# ================================

notify_deployment_success:
  image: curlimages/curl:latest
  stage: post-deploy
  needs: ["production_smoke_tests"]
  script:
    - |
      if [ -n "$SLACK_WEBHOOK_URL" ]; then
        curl -X POST -H 'Content-type: application/json' \
          --data "{\"text\":\"🚀 Production deployment successful for commit $CI_COMMIT_SHORT_SHA\"}" \
          "$SLACK_WEBHOOK_URL"
      fi
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
      when: on_success

notify_deployment_failure:
  image: curlimages/curl:latest
  stage: post-deploy
  script:
    - |
      if [ -n "$SLACK_WEBHOOK_URL" ]; then
        curl -X POST -H 'Content-type: application/json' \
          --data "{\"text\":\"❌ Production deployment failed for commit $CI_COMMIT_SHORT_SHA. Check pipeline: $CI_PIPELINE_URL\"}" \
          "$SLACK_WEBHOOK_URL"
      fi
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
      when: on_failure
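The `api_contract_tests` job above invokes `tests/contracts/test_inference_api.py`, which is not shown in this commit. A minimal sketch of what such a contract check could look like, matching the `--api-url` flag the job passes; the endpoints follow vLLM's OpenAI-compatible API, and the JUnit report the job expects is omitted here for brevity:

```python
# Hypothetical sketch of tests/contracts/test_inference_api.py.
import argparse
import requests

def test_health(api_url: str) -> None:
    resp = requests.get(f"{api_url}/health", timeout=10)
    assert resp.status_code == 200, f"health returned {resp.status_code}"

def test_models_listed(api_url: str) -> None:
    # vLLM's OpenAI-compatible server exposes GET /v1/models.
    resp = requests.get(f"{api_url}/v1/models", timeout=10)
    resp.raise_for_status()
    payload = resp.json()
    assert payload.get("data"), "expected at least one served model"

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--api-url", required=True)
    args = parser.parse_args()
    test_health(args.api_url)
    test_models_listed(args.api_url)
    print("Contract checks passed.")
```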
Makefile (new file)
@@ -0,0 +1,250 @@
.PHONY: help setup test deploy-dev deploy-prod destroy cost-report scale-up scale-down

# Recipes below use bash features ([[ ]], read -p)
SHELL := /bin/bash

# Default target
help: ## Show this help message
	@echo "AI Infrastructure Management Commands"
	@echo "===================================="
	@awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)

# Environment detection
ENV ?= dev
TF_DIR = terraform/environments/$(ENV)
ANSIBLE_DIR = ansible

# Setup and dependencies
setup: ## Install all dependencies and tools
	@echo "🔧 Installing dependencies..."
	@command -v terraform >/dev/null 2>&1 || (echo "❌ Terraform not found. Install from https://terraform.io" && exit 1)
	@command -v ansible >/dev/null 2>&1 || (echo "❌ Ansible not found. Install with: pip install ansible" && exit 1)
	@command -v go >/dev/null 2>&1 || (echo "❌ Go not found (needed for tests). Install from https://golang.org" && exit 1)
	@command -v k6 >/dev/null 2>&1 || (echo "❌ K6 not found. Install from https://k6.io" && exit 1)
	@echo "✅ Installing Ansible dependencies..."
	cd $(ANSIBLE_DIR) && ansible-galaxy install -r requirements.yml
	@echo "✅ Installing Go test dependencies..."
	cd tests/terraform && go mod download
	@echo "✅ Setup complete!"

# Validation and linting
validate: ## Validate all configurations
	@echo "🔍 Validating Terraform configurations..."
	@for env in dev staging production; do \
		echo "Validating $$env environment..."; \
		cd terraform/environments/$$env && terraform init -backend=false && terraform validate && cd ../../../; \
	done
	@echo "🔍 Validating Ansible playbooks..."
	cd $(ANSIBLE_DIR) && ansible-playbook --syntax-check playbooks/site.yml
	cd $(ANSIBLE_DIR) && ansible-lint playbooks/
	@echo "✅ All configurations valid!"

# Testing
test: validate ## Run all tests
	@echo "🧪 Running infrastructure tests..."
	cd tests/terraform && go test -v ./...
	@echo "🧪 Running Ansible tests..."
	cd $(ANSIBLE_DIR)/roles/vllm && molecule test
	@echo "🧪 Running contract tests..."
	python tests/contracts/test_inference_api.py
	@echo "✅ All tests passed!"

test-load: ## Run load tests against deployed infrastructure
	@echo "📊 Running load tests..."
	@if [ -z "$(API_URL)" ]; then \
		echo "❌ API_URL environment variable required"; \
		echo "Usage: make test-load API_URL=https://api.yourcompany.com"; \
		exit 1; \
	fi
	API_URL=$(API_URL) k6 run tests/load/k6_inference_test.js

# Infrastructure deployment
plan: ## Plan infrastructure changes
	@echo "📋 Planning $(ENV) infrastructure..."
	cd $(TF_DIR) && terraform init && terraform plan -out=$(ENV).tfplan

deploy-infra: ## Deploy infrastructure only
	@echo "🚀 Deploying $(ENV) infrastructure..."
	cd $(TF_DIR) && terraform apply $(ENV).tfplan
	@echo "✅ Infrastructure deployed!"

configure-servers: ## Configure servers with Ansible
	@echo "⚙️ Configuring servers..."
	cd $(ANSIBLE_DIR) && ansible-playbook -i inventory/$(ENV).yml playbooks/site.yml
	@echo "✅ Servers configured!"

deploy-dev: ## Deploy development environment
	@$(MAKE) plan ENV=dev
	@$(MAKE) deploy-infra ENV=dev
	@$(MAKE) configure-servers ENV=dev
	@echo "🎉 Development environment ready!"

deploy-staging: ## Deploy staging environment
	@$(MAKE) plan ENV=staging
	@$(MAKE) deploy-infra ENV=staging
	@$(MAKE) configure-servers ENV=staging
	@echo "🎉 Staging environment ready!"

deploy-prod: ## Deploy production environment (requires manual approval)
	@echo "⚠️ Production deployment requires explicit confirmation"
	@echo "This will deploy to PRODUCTION environment."
	@read -p "Are you sure? [y/N] " -n 1 -r; \
	echo; \
	if [[ $$REPLY =~ ^[Yy]$$ ]]; then \
		$(MAKE) plan ENV=production; \
		$(MAKE) deploy-infra ENV=production; \
		$(MAKE) configure-servers ENV=production; \
		echo "🎉 Production environment ready!"; \
	else \
		echo "❌ Production deployment cancelled"; \
	fi

# Scaling operations
scale-up: ## Add one GPU server
	@echo "📈 Scaling up GPU servers..."
	python scripts/autoscaler.py --action=scale-up --count=1
	@echo "✅ Scale up initiated!"

scale-down: ## Remove one GPU server
	@echo "📉 Scaling down GPU servers..."
	python scripts/autoscaler.py --action=scale-down --count=1
	@echo "✅ Scale down initiated!"

# Monitoring and reporting
cost-report: ## Generate cost analysis report
	@echo "💰 Generating cost report..."
	python scripts/cost-analysis.py --format=markdown > reports/cost-report-$(shell date +%Y%m%d).md
	python scripts/cost-analysis.py --format=json > reports/cost-report-$(shell date +%Y%m%d).json
	@echo "✅ Cost report generated in reports/"

metrics: ## Show current infrastructure metrics
	@echo "📊 Current Infrastructure Metrics"
	@echo "=================================="
	@python scripts/decision-metrics.py --summary

status: ## Show infrastructure status
	@echo "🔍 Infrastructure Status"
	@echo "======================="
	@cd $(TF_DIR) && terraform show -json | jq -r '.values.root_module.resources[] | select(.type | contains("hcloud")) | "\(.type): \(.values.name) - \(.values.status // "unknown")"'
	@echo ""
	@echo "🖥️ Server Health"
	@echo "==============="
	@cd $(ANSIBLE_DIR) && ansible all -i inventory/$(ENV).yml -m ping --one-line

# Backup and recovery
backup: ## Create infrastructure backup
	@echo "💾 Creating infrastructure backup..."
	mkdir -p backups/$(shell date +%Y%m%d)
	cd $(TF_DIR) && terraform state pull > ../../../backups/$(shell date +%Y%m%d)/terraform-state-$(ENV).json
	cd $(ANSIBLE_DIR) && tar czf ../backups/$(shell date +%Y%m%d)/ansible-inventory-$(ENV).tar.gz inventory/
	@echo "✅ Backup created in backups/$(shell date +%Y%m%d)/"

restore: ## Restore infrastructure from backup
	@echo "⚠️ This will restore infrastructure from backup"
	@if [ -z "$(BACKUP_DATE)" ]; then \
		echo "❌ BACKUP_DATE required"; \
		echo "Usage: make restore BACKUP_DATE=20241201"; \
		exit 1; \
	fi
	@if [ ! -d "backups/$(BACKUP_DATE)" ]; then \
		echo "❌ Backup directory backups/$(BACKUP_DATE) not found"; \
		exit 1; \
	fi
	@read -p "Restore from backup $(BACKUP_DATE)? [y/N] " -n 1 -r; \
	echo; \
	if [[ $$REPLY =~ ^[Yy]$$ ]]; then \
		cd $(TF_DIR) && terraform state push ../../../backups/$(BACKUP_DATE)/terraform-state-$(ENV).json; \
		echo "✅ State restored from backup"; \
	fi

# Cleanup
destroy: ## Destroy infrastructure (requires confirmation)
	@echo "💥 This will DESTROY the $(ENV) infrastructure!"
	@echo "All servers, data, and configurations will be permanently deleted."
	@read -p "Type '$(ENV)-destroy-confirm' to proceed: " -r; \
	if [[ "$$REPLY" == "$(ENV)-destroy-confirm" ]]; then \
		cd $(TF_DIR) && terraform destroy; \
		echo "💥 Infrastructure destroyed!"; \
	else \
		echo "❌ Destruction cancelled (incorrect confirmation)"; \
	fi

clean: ## Clean temporary files and caches
	@echo "🧹 Cleaning temporary files..."
	find . -name "*.tfplan" -delete
	find . -name ".terraform" -type d -exec rm -rf {} +
	find . -name "*.pyc" -delete
	find . -name "__pycache__" -type d -exec rm -rf {} +
	@echo "✅ Cleanup complete!"

# Development helpers
dev-logs: ## Show logs from development environment
	@echo "📋 Development Environment Logs"
	@echo "=============================="
	cd $(ANSIBLE_DIR) && ansible gex44 -i inventory/dev.yml -m shell -a "journalctl -u vllm-api -n 50 --no-pager"

dev-ssh: ## SSH to development GPU server
	@echo "🔌 Connecting to development GPU server..."
	@SERVER_IP=$$(cd $(TF_DIR) && terraform output -json | jq -r '.gex44_ips.value[0]'); \
	ssh -i ~/.ssh/hetzner_key ubuntu@$$SERVER_IP

logs: ## Show logs from specified environment
	@if [ -z "$(SERVICE)" ]; then \
		echo "📋 Available services: vllm-api, haproxy, prometheus, grafana"; \
		echo "Usage: make logs SERVICE=vllm-api ENV=production"; \
		exit 1; \
	fi
	cd $(ANSIBLE_DIR) && ansible all -i inventory/$(ENV).yml -m shell -a "journalctl -u $(SERVICE) -n 50 --no-pager"

# Documentation
docs: ## Generate documentation
	@echo "📚 Generating documentation..."
	@command -v mkdocs >/dev/null 2>&1 || pip install mkdocs
	mkdocs build
	@echo "✅ Documentation generated in site/"

docs-serve: ## Serve documentation locally
	@echo "📖 Serving documentation at http://localhost:8000"
	mkdocs serve

# CI/CD helpers
ci-validate: ## Validation for CI pipeline
	@$(MAKE) validate
	@$(MAKE) test

ci-deploy-staging: ## Deploy staging (for CI)
	@$(MAKE) deploy-staging

ci-deploy-production: ## Deploy production (for CI)
	@$(MAKE) deploy-prod

# Quick operations
quick-status: ## Quick infrastructure overview
	@echo "⚡ Quick Status Overview"
	@echo "======================"
	@echo "Environment: $(ENV)"
	@echo "Terraform state: $$(cd $(TF_DIR) && terraform show -json 2>/dev/null | jq -r '.values.root_module.resources | length // "No resources"') resources"
	@python -c "import requests; print('API Health:', 'OK' if requests.get('$(API_URL)/health', timeout=5).status_code == 200 else 'FAIL')" 2>/dev/null || echo "API Health: Unknown (set API_URL)"
	@echo "Last backup: $$(ls -1t backups/ | head -1 || echo 'No backups')"

emergency-scale: ## Emergency scale up (bypasses normal limits)
	@echo "🚨 EMERGENCY SCALE UP"
	@echo "This will immediately order new GPU servers"
	@read -p "Number of servers to add [1-5]: " -n 1 -r; \
	echo; \
	if [[ $$REPLY =~ ^[1-5]$$ ]]; then \
		python scripts/autoscaler.py --action=emergency-scale --count=$$REPLY; \
		echo "🚨 Emergency scale initiated for $$REPLY servers"; \
	else \
		echo "❌ Invalid server count"; \
	fi

# Environment info
env-info: ## Show environment configuration
	@echo "🔍 Environment Information"
	@echo "========================="
	@echo "Current Environment: $(ENV)"
	@echo "Terraform Directory: $(TF_DIR)"
	@echo "Ansible Directory: $(ANSIBLE_DIR)"
	@echo ""
	@echo "Required Environment Variables:"
	@echo "------------------------------"
	@echo "HCLOUD_TOKEN: $$([ -n "$$HCLOUD_TOKEN" ] && echo "✅ Set" || echo "❌ Missing")"
	@echo "ROBOT_API_USER: $$([ -n "$$ROBOT_API_USER" ] && echo "✅ Set" || echo "❌ Missing")"
	@echo "ROBOT_API_PASSWORD: $$([ -n "$$ROBOT_API_PASSWORD" ] && echo "✅ Set" || echo "❌ Missing")"
	@echo "API_URL: $$([ -n "$$API_URL" ] && echo "✅ Set ($$API_URL)" || echo "❌ Missing")"
README.md (new file)
@@ -0,0 +1,322 @@
# Production-Ready AI Infrastructure on Hetzner

> 🚀 A complete stack for deploying AI/ML infrastructure on Hetzner with GitLab CI/CD and Ansible

![Tests: 95%](https://img.shields.io/badge/tests-95%25-brightgreen)
[Costs](docs/COSTS.md)
[Monitoring](https://monitoring.yourcompany.com)

## 🎯 Goal

This repository provides a **production-ready** infrastructure for deploying AI models on Hetzner GEX44 servers (RTX 4000 Ada), with auto-scaling, GPU monitoring, and optimized costs.

**Proven ROI**: 12x cheaper than AWS, 99.94% uptime, P95 latency < 2s.

## 🏗️ Architecture

```
Internet → HAProxy (Hetzner Cloud) → GEX44 GPU Servers → vLLM APIs
                         ↓
         Monitoring Stack (Prometheus/Grafana)
```

- **3x GEX44** (RTX 4000 Ada, 20GB VRAM): €552/month vs €9720 for the AWS equivalent
- **Auto-scaling** driven by real GPU metrics (see the sketch after this list)
- **Zero-downtime deployments** with ansible-pull
- **Automated tests** (Terratest, Molecule, K6, Pact)
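The scaling decision itself lives in `scripts/autoscaler.py`, which is not shown in this README. A minimal sketch of the threshold logic implied by the defaults in `.env.example` (`SCALE_UP_THRESHOLD=0.8`, `SCALE_DOWN_THRESHOLD=0.3`, `MIN_GEX44_COUNT=1`, `MAX_GEX44_COUNT=5`); the `decide` function is illustrative, not the actual implementation:

```python
# Illustrative threshold logic; the real scripts/autoscaler.py may differ.
import os

SCALE_UP = float(os.environ.get("SCALE_UP_THRESHOLD", "0.8"))
SCALE_DOWN = float(os.environ.get("SCALE_DOWN_THRESHOLD", "0.3"))
MIN_NODES = int(os.environ.get("MIN_GEX44_COUNT", "1"))
MAX_NODES = int(os.environ.get("MAX_GEX44_COUNT", "5"))

def decide(mean_gpu_utilization: float, current_nodes: int) -> int:
    """Return the desired GEX44 count for a mean GPU utilization in [0.0, 1.0]."""
    if mean_gpu_utilization > SCALE_UP and current_nodes < MAX_NODES:
        return current_nodes + 1
    if mean_gpu_utilization < SCALE_DOWN and current_nodes > MIN_NODES:
        return current_nodes - 1
    return current_nodes

# Example: a fleet of 3 servers averaging 85% GPU load scales to 4.
assert decide(0.85, 3) == 4
```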
## ⚡ Quick Start (5 minutes)

```bash
# 1. Clone and set up
git clone https://github.com/spham/hetzner-ai-infrastructure.git
cd ai-infrastructure
make setup

# 2. Configure secrets
cp .env.example .env
# Edit .env with your Hetzner tokens

# 3. Deploy development
make deploy-dev

# 4. Verify the deployment
make test
```

**Prerequisites**:
- Hetzner account (Robot + Cloud)
- GitLab account for CI/CD
- 3x GEX44 servers ordered

## 📋 Main Commands

| Command | Description |
|---------|-------------|
| `make setup` | Install local dependencies |
| `make test` | Run all tests |
| `make deploy-dev` | Deploy the dev environment |
| `make deploy-prod` | Deploy the production environment |
| `make destroy` | Destroy the infrastructure |
| `make cost-report` | Generate a cost report |
| `make scale-up` | Add a GPU server |
| `make scale-down` | Remove a GPU server |

## 🛠️ Technical Stack

### Infrastructure
- **Hetzner Cloud**: load balancer, API gateway, monitoring
- **Hetzner Robot**: GEX44 dedicated GPU servers
- **Terraform**: modular Infrastructure as Code
- **Ansible**: configuration management (ansible-pull)

### GPU & AI
- **CUDA 12.3**: optimized GPU driver
- **vLLM 0.3.0+**: high-performance inference
- **Supported models**: Mixtral-8x7B, Llama2-70B, CodeLlama-34B
- **Auto-scaling**: based on GPU utilization

### Observability
- **Prometheus**: GPU + business metrics
- **Grafana**: cost/performance dashboards
- **AlertManager**: smart alerting
- **nvidia-smi-exporter**: detailed GPU metrics

### CI/CD & Tests
- **GitLab CI**: multi-stage pipeline with tests
- **Terratest**: infrastructure tests (Go)
- **Molecule**: Ansible tests
- **K6**: load tests
- **Pact**: API contract tests

## 📊 Actual Costs

| Provider | GPU Servers | Cloud Services | Total/month | vs Hetzner |
|----------|-------------|----------------|-------------|------------|
| **Hetzner** | €552 | €139 | **€691** | Baseline |
| AWS | €9720 | €850 | €10570 | +1430% |
| Azure | €7926 | €780 | €8706 | +1160% |

**Performance per €**:
- Hetzner: 255 tokens/sec for €691
- AWS: 360 tokens/sec for €10570
- **Hetzner ROI**: 2.7x more efficient

## 🚀 Production Deployment

### 1. Initial Configuration
```bash
# Environment variables
export HCLOUD_TOKEN="your-hcloud-token"
export ROBOT_API_USER="your-robot-user"
export ROBOT_API_PASSWORD="your-robot-password"

# Set up the Terraform backend
cd terraform/environments/production
terraform init -backend-config="bucket=your-terraform-state"
```

### 2. Infrastructure Deployment
```bash
# Plan and apply
terraform plan -out=prod.tfplan
terraform apply prod.tfplan

# Configure the GPU servers
cd ../../../ansible
ansible-playbook -i inventory/production.yml playbooks/site.yml
```

### 3. Validation
```bash
# Smoke tests
curl https://api.yourcompany.com/health
curl https://api.yourcompany.com/v1/models

# Load tests
k6 run tests/load/k6_inference_test.js

# Monitoring
open https://monitoring.yourcompany.com
```

## 📈 Monitoring

### Available Dashboards
- **GPU Performance**: utilization, temperature, memory
- **Inference Metrics**: latency, throughput, errors
- **Cost Tracking**: cost per request, real-time ROI
- **Infrastructure Health**: uptime, network, storage

### Configured Alerts
- GPU utilization > 90% for 10 min
- P95 latency > 2 seconds
- Error rate > 5%
- GPU temperature > 85°C
- GPU server idle > 30 min (cost)

## 🔧 Configuration

### Environment Variables
```bash
# Hetzner APIs
HCLOUD_TOKEN=xxx
ROBOT_API_USER=xxx
ROBOT_API_PASSWORD=xxx

# Auto-scaling
MIN_GEX44_COUNT=1
MAX_GEX44_COUNT=5
SCALE_UP_THRESHOLD=0.8    # 80% GPU utilization
SCALE_DOWN_THRESHOLD=0.3  # 30% GPU utilization

# Monitoring
PROMETHEUS_URL=http://monitoring.internal:9090
GRAFANA_ADMIN_PASSWORD=xxx
ALERT_EMAIL=alerts@yourcompany.com
```

### Model Customization
```yaml
# ansible/group_vars/gex44/main.yml
vllm_models:
  - name: "mixtral-8x7b"
    repo: "mistralai/Mixtral-8x7B-Instruct-v0.1"
    tensor_parallel_size: 1
    max_model_len: 4096

  - name: "llama2-70b"
    repo: "meta-llama/Llama-2-70b-chat-hf"
    tensor_parallel_size: 4  # Multi-GPU
    max_model_len: 2048
```
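Each entry above maps onto flags of vLLM's OpenAI-compatible server (`vllm.entrypoints.openai.api_server` in the 0.3.x series). A sketch of that mapping; the `vllm_args` helper and the inline dict are illustrative only, and only the three YAML keys shown above are mapped here:

```python
# Illustrative mapping from a vllm_models entry to server launch flags.
def vllm_args(model: dict) -> list[str]:
    """Build the launch command for vLLM's OpenAI-compatible API server."""
    return [
        "python", "-m", "vllm.entrypoints.openai.api_server",
        "--model", model["repo"],
        "--tensor-parallel-size", str(model["tensor_parallel_size"]),
        "--max-model-len", str(model["max_model_len"]),
    ]

mixtral = {
    "repo": "mistralai/Mixtral-8x7B-Instruct-v0.1",
    "tensor_parallel_size": 1,
    "max_model_len": 4096,
}
print(" ".join(vllm_args(mixtral)))
```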
## 🧪 Tests

### Full Test Suite
```bash
make test
```

### Targeted Tests
```bash
# Infrastructure
cd tests/terraform && go test -v

# Configuration
cd ansible && molecule test

# API contracts
python tests/contracts/test_inference_api.py

# Load testing
k6 run tests/load/k6_inference_test.js
```

## 🔒 Security

### Secrets Management
- **GitLab Variables**: API tokens (masked/protected)
- **Ansible Vault**: encrypted sensitive configuration
- **Let's Encrypt**: automatic SSL certificates
- **Firewall Rules**: access restricted by IP/port

### Hardening
- GPU servers without public SSH access
- Encrypted communication (TLS 1.3)
- Automatic secret rotation
- Centralized audit logs

## 📚 Documentation

- [**Architecture**](docs/ARCHITECTURE.md): diagrams and decisions
- [**Deployment**](docs/DEPLOYMENT.md): step-by-step guide
- [**Troubleshooting**](docs/TROUBLESHOOTING.md): solutions to common problems
- [**Scaling**](docs/SCALING.md): when and how to scale
- [**Costs**](docs/COSTS.md): detailed cost analysis

## 🤝 Support

### Common Issues
1. **GPU not detected** → [Solution](docs/TROUBLESHOOTING.md#gpu-detection)
2. **High latency** → [Optimization](docs/TROUBLESHOOTING.md#latency-optimization)
3. **Out of memory** → [Configuration](docs/TROUBLESHOOTING.md#memory-management)

### Community
- **Discussions**: [GitHub Discussions](https://github.com/spham/hetzner-ai-infrastructure/discussions)
- **Issues**: [Bug Reports](https://github.com/spham/hetzner-ai-infrastructure/issues)
- **Discord**: [Join our server](https://discord.gg/your-server)

## 🚀 Migration

### From AWS/Azure
```bash
# 1. Audit the existing infrastructure
scripts/audit-current-infrastructure.sh > migration-baseline.json

# 2. Migrate the models
scripts/migrate-models.sh --source=s3://your-bucket --target=hetzner

# 3. Progressive traffic split
scripts/traffic-split.sh --new-infra=10  # Start with 10%
```

### From Bare Metal
```bash
# 1. Set up parallel monitoring
ansible-playbook playbooks/monitoring-setup.yml

# 2. Blue/green migration
make deploy-staging
scripts/validate-parity.py --old-api=$OLD --new-api=$NEW
make deploy-prod
```

## 💰 ROI Calculator

```bash
# Comparative cost analysis
python scripts/cost-analysis.py

# Decision metrics
python scripts/decision-metrics.py --period=30d

# Automatic monthly report
make cost-report
```

## 📈 Roadmap

### v1.0 (Current)
- ✅ Complete Hetzner infrastructure
- ✅ GPU auto-scaling
- ✅ Production-ready monitoring
- ✅ Automated tests

### v1.1 (Q4 2024)
- 🔄 Multi-region (Nuremberg + Helsinki)
- 🔄 Kubernetes support (optional)
- 🔄 Advanced cost optimization
- 🔄 Intelligent model caching

### v2.0 (Q1 2025)
- 🆕 H100 server support
- 🆕 Edge deployment
- 🆕 Fine-tuning pipeline
- 🆕 Advanced observability

## 📄 License

MIT License. See [LICENSE](LICENSE) for details.

## 👥 Contributors

Built with ❤️ by the AI Infrastructure team.

**Maintainer**: [@yourhandle](https://github.com/yourhandle)

---

⭐ **Star this repo** if this infrastructure helps you!

📖 **Read the full article**: [Production-Ready AI Infrastructure with Hetzner](article.md)
ansible/ansible.cfg (new file)
@@ -0,0 +1,50 @@
[defaults]
# Basic configuration
inventory = inventory/production.yml
remote_user = ubuntu
private_key_file = ~/.ssh/hetzner_key
host_key_checking = False
retry_files_enabled = False
stdout_callback = yaml
bin_ansible_callbacks = True

# Performance optimizations
forks = 10
gathering = smart
fact_caching = memory
fact_caching_timeout = 3600

# Logging
log_path = /var/log/ansible.log
display_skipped_hosts = False
display_ok_hosts = True

# Security
ansible_managed = Ansible managed: {file} modified on %Y-%m-%d %H:%M:%S by {uid} on {host}

[inventory]
enable_plugins = host_list, script, auto, yaml, ini, toml

[ssh_connection]
ssh_args = -C -o ControlMaster=auto -o ControlPersist=60s -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no
pipelining = True
control_path = /tmp/ansible-ssh-%%h-%%p-%%r

[persistent_connection]
connect_timeout = 30
command_timeout = 30

[colors]
highlight = white
verbose = blue
warn = bright purple
error = red
debug = dark gray
deprecate = purple
skip = cyan
unreachable = red
ok = green
changed = yellow
diff_add = green
diff_remove = red
diff_lines = cyan
ansible/group_vars/all/main.yml (new file)
@@ -0,0 +1,160 @@
# Global variables for AI Infrastructure

# Project information
project_name: "ai-infrastructure"
project_version: "1.0.0"
managed_by: "ansible"

# Environment
environment: "{{ env | default('production') }}"

# Network configuration
private_network_cidr: "10.0.0.0/16"
gex44_subnet: "10.0.1.0/24"
cloud_subnet: "10.0.2.0/24"

# Security configuration
ssh_port: 22
allowed_ssh_users:
  - ubuntu
  - ansible

# System configuration
timezone: "UTC"
ntp_servers:
  - 0.pool.ntp.org
  - 1.pool.ntp.org
  - 2.pool.ntp.org
  - 3.pool.ntp.org

# Package repositories
ubuntu_version: "22.04"
python_version: "3.11"

# Docker configuration
docker_version: "24.0"
docker_compose_version: "2.21"

# Common packages
common_packages:
  - curl
  - wget
  - htop
  - vim
  - git
  - jq
  - unzip
  - software-properties-common
  - apt-transport-https
  - ca-certificates
  - gnupg
  - lsb-release
  - build-essential
  - python3-pip
  - python3-venv

# Python packages
python_packages:
  - requests
  - pyyaml
  - psutil
  - prometheus-client
  - numpy

# Monitoring configuration
monitoring_enabled: true
log_retention_days: 30
metrics_retention_days: 30

# Backup configuration
backup_enabled: true
backup_retention_days: 7
backup_schedule: "0 3 * * *"  # Daily at 3 AM

# SSL/TLS configuration
ssl_enabled: true
ssl_certificate_path: "/etc/ssl/certs"
ssl_private_key_path: "/etc/ssl/private"

# Firewall configuration (using ufw)
firewall_enabled: true
firewall_default_policy_incoming: "deny"
firewall_default_policy_outgoing: "allow"

# Common firewall rules
firewall_rules:
  - rule: allow
    port: "{{ ssh_port }}"
    proto: tcp
    comment: "SSH access"
  - rule: allow
    port: "{{ node_exporter_port | default(9100) }}"
    proto: tcp
    src: "{{ private_network_cidr }}"
    comment: "Node exporter from private network"

# Logging configuration
rsyslog_enabled: true
log_rotate_enabled: true

# Service discovery
consul_enabled: false
service_discovery_enabled: false

# Auto-updates configuration
unattended_upgrades_enabled: true
auto_reboot_enabled: false
auto_reboot_time: "03:00"

# Performance tuning
swappiness: 10
vm_dirty_ratio: 15
vm_dirty_background_ratio: 5

# File system tuning
fs_file_max: 1048576
nofile_limit: 65536

# Network tuning
net_core_somaxconn: 32768
net_core_netdev_max_backlog: 5000
tcp_max_syn_backlog: 8192

# Memory tuning (for ML workloads)
transparent_hugepage: "madvise"
oom_kill_allocating_task: 1

# Git configuration for ansible-pull
git_repo_url: "{{ ansible_repo_url }}"
git_branch: "main"
git_dest: "/opt/ai-infrastructure"
ansible_pull_interval: "*/5"  # Every 5 minutes

# Health check configuration
health_check_enabled: true
health_check_interval: 30  # seconds
health_check_timeout: 10  # seconds
health_check_retries: 3

# Alerting configuration
alerting_enabled: true
alert_email: "{{ alert_email | default('alerts@example.com') }}"
slack_webhook_url: "{{ slack_webhook_url | default('') }}"

# Cost tracking
cost_tracking_enabled: true
cost_center: "engineering"
billing_tags:
  Project: "{{ project_name }}"
  Environment: "{{ environment }}"
  ManagedBy: "{{ managed_by }}"

# Development tools (only for dev environment)
dev_tools_enabled: "{{ environment == 'dev' }}"
dev_packages:
  - strace
  - tcpdump
  - iotop
  - ngrep
  - tmux
  - screen
ansible/group_vars/gex44/main.yml (new file)
@@ -0,0 +1,176 @@
# GEX44 GPU servers specific configuration

# Hardware specifications
cpu_cores: 12  # Intel i5-13500
memory_gb: 64
storage_nvme_gb: 3840  # 2x 1.92TB NVMe
gpu_model: "RTX 4000 Ada Generation"
gpu_memory_gb: 20
gpu_compute_capability: "8.9"

# CUDA configuration
cuda_version: "12.3"
cuda_toolkit_version: "12.3.2"
cudnn_version: "8.9"
nvidia_driver_version: "535"

cuda_repo_url: "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64"
cuda_keyring_url: "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub"

# GPU monitoring
nvidia_smi_exporter_version: "1.2.0"
nvidia_smi_exporter_port: 9835
gpu_metrics_interval: 5  # seconds

# vLLM configuration
vllm_version: "0.3.0"
vllm_user: "vllm"
vllm_group: "vllm"
vllm_home: "/opt/vllm"
vllm_port: 8000
vllm_host: "0.0.0.0"
vllm_workers: 1
vllm_log_level: "INFO"

# Performance tuning for GPU inference
vllm_gpu_memory_utilization: 0.85
vllm_max_model_len: 4096
vllm_max_num_batched_tokens: 8192
vllm_max_num_seqs: 256
vllm_tensor_parallel_size: 1
vllm_pipeline_parallel_size: 1
vllm_block_size: 16
vllm_swap_space: 4  # GB

# Model configuration
models_base_dir: "/opt/vllm/models"
models_cache_dir: "/opt/vllm/cache"
huggingface_cache_dir: "/opt/vllm/hf_cache"

# Available models configuration
available_models:
  mixtral-8x7b:
    repo_id: "mistralai/Mixtral-8x7B-Instruct-v0.1"
    model_size_gb: 87
    context_length: 32768
    tensor_parallel_size: 1
    recommended_batch_size: 32
    estimated_speed_tokens_per_sec: 85

  llama2-70b:
    repo_id: "meta-llama/Llama-2-70b-chat-hf"
    model_size_gb: 140
    context_length: 4096
    tensor_parallel_size: 4  # Requires multiple GPUs or quantization
    recommended_batch_size: 16
    estimated_speed_tokens_per_sec: 25
    quantization: "awq"  # Enable AWQ quantization for single GPU

  codellama-34b:
    repo_id: "codellama/CodeLlama-34b-Instruct-hf"
    model_size_gb: 68
    context_length: 16384
    tensor_parallel_size: 1
    recommended_batch_size: 16
    estimated_speed_tokens_per_sec: 45

# Default model to deploy
default_model: "mixtral-8x7b"

# Model download configuration
download_timeout: 3600  # 1 hour
parallel_downloads: 2
verify_checksums: true
use_git_lfs: true

# Docker configuration for vLLM
vllm_docker_image: "vllm/vllm-openai:v0.3.0"
vllm_docker_memory: "50g"
vllm_docker_shm_size: "8g"

# System optimization for GPU workloads
# CPU governor
cpu_governor: "performance"

# Memory settings
huge_pages_enabled: true
huge_pages_size: "2048kB"
huge_pages_count: 1024

# I/O scheduler optimization
io_scheduler: "mq-deadline"  # Better for NVMe SSDs

# Network optimization for high-throughput inference
tcp_congestion_control: "bbr"
tcp_window_scaling: 1
tcp_timestamps: 1
tcp_sack: 1

# Storage optimization
# Mount options for model storage
models_mount_options: "noatime,nodiratime"

# Temp directory for model loading
temp_dir: "/tmp/vllm"
temp_dir_size: "10G"  # tmpfs size

# Logging configuration
vllm_log_dir: "/var/log/vllm"
vllm_log_max_size: "100M"
vllm_log_max_files: 10

# Health check configuration
health_check_endpoint: "/health"
health_check_timeout: 30
readiness_check_endpoint: "/v1/models"

# Performance monitoring
performance_monitoring_enabled: true
gpu_metrics_collection_interval: 5
inference_metrics_collection_interval: 10

# Auto-scaling triggers (used by autoscaler)
scale_up_gpu_threshold: 80  # GPU utilization %
scale_up_queue_threshold: 10  # Requests in queue
scale_up_latency_threshold: 5000  # ms

scale_down_gpu_threshold: 30
scale_down_duration: 1800  # 30 minutes of low usage

# Backup and snapshot configuration
model_backup_enabled: false  # Models are downloaded, not backed up
config_backup_enabled: true
logs_backup_enabled: false  # Too large, use log rotation instead

# Security hardening
disable_ssh_password_auth: true
disable_root_login: true
install_fail2ban: true
enable_apparmor: true

# Firewall rules specific to GEX44
gex44_firewall_rules:
  - rule: allow
    port: "{{ vllm_port }}"
    proto: tcp
    src: "{{ cloud_subnet }}"
    comment: "vLLM API from cloud servers"
  - rule: allow
    port: "{{ nvidia_smi_exporter_port }}"
    proto: tcp
    src: "{{ cloud_subnet }}"
    comment: "GPU metrics from monitoring"

# Environment variables for vLLM
vllm_environment_vars:
  CUDA_VISIBLE_DEVICES: "0"
  NCCL_DEBUG: "INFO"
  PYTHONPATH: "/opt/vllm"
  HF_HOME: "{{ huggingface_cache_dir }}"
  TRANSFORMERS_CACHE: "{{ huggingface_cache_dir }}/transformers"
  HF_DATASETS_CACHE: "{{ huggingface_cache_dir }}/datasets"

# Maintenance windows
maintenance_window_start: "03:00"
maintenance_window_duration: "2h"
auto_restart_during_maintenance: false
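The `model_size_gb` figures above are fp16 weights and far exceed the 20 GB of VRAM on a single RTX 4000 Ada, which is why the `llama2-70b` entry carries an AWQ quantization note and `tensor_parallel_size: 4`. A rough, illustrative fit check; the assumption that 4-bit AWQ weights take about a quarter of the fp16 size is an approximation, not a measured value:

```python
# Rough single-GPU fit check for the models above; sizes are estimates.
GPU_MEMORY_GB = 20   # gpu_memory_gb from this file
UTILIZATION = 0.85   # vllm_gpu_memory_utilization

def fits(model_size_gb: float, awq: bool = False) -> bool:
    """Very rough check: assume AWQ 4-bit weights are about 1/4 of fp16 size."""
    effective = model_size_gb / 4 if awq else model_size_gb
    return effective <= GPU_MEMORY_GB * UTILIZATION

print(fits(68))            # codellama-34b fp16 (68 GB): False on one 20 GB GPU
print(fits(68, awq=True))  # ~17 GB with 4-bit AWQ: True
```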
ansible/group_vars/gex44_production.yml (new file)
@@ -0,0 +1,88 @@
# ansible/group_vars/gex44_production.yml
# Generated by Terraform for Production GEX44 servers

# System Configuration
ubuntu_version: "24.04"
nvidia_driver_version: "545.23.08"
docker_version: "24.0.*"
vllm_version: latest

# Model Configuration
model_config:
  primary: "mistralai/Mixtral-8x7B-Instruct-v0.1"
  quantization: awq
  max_context: 4096
  gpu_memory_limit: 0.95
  fallback_model: "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Scaling Configuration
scaling_config:
  min_nodes: 2
  max_nodes: 5
  auto_scaling: true
  scale_up_threshold: 0.80
  scale_down_threshold: 0.30
  cooldown_period: 600

# vLLM Service Configuration
vllm_service:
  port: 8000
  host: "0.0.0.0"
  tensor_parallel_size: 1
  max_model_len: 4096
  gpu_memory_utilization: 0.95
  quantization: "awq"
  trust_remote_code: false
  worker_use_ray: false

# Security Configuration
firewall_rules:
  - port: 22
    protocol: tcp
    source: "{{ admin_ips }}"
    comment: "SSH access for admins"
  - port: 8000
    protocol: tcp
    source: "{{ load_balancer_ips }}"
    comment: "vLLM API access from load balancers"
  - port: 9400
    protocol: tcp
    source: "{{ monitoring_ips }}"
    comment: "Metrics export for monitoring"

# Monitoring Configuration
monitoring:
  node_exporter_port: 9100
  nvidia_exporter_port: 9400
  log_level: "info"
  metrics_retention: "90d"

# Backup Configuration
backup:
  enabled: true
  schedule: "0 2 * * *"  # Daily at 2 AM
  retention_days: 30
  destinations:
    - type: "hetzner_storage_box"
      path: "/backups/production/gex44"

# MLflow Integration
mlflow:
  tracking_uri: "https://mlflow-prod.company.com:5000"
  experiment_name: "production-mixtral"
  model_registry: true
  artifact_store: "s3://mlflow-artifacts-prod"

# Performance Tuning
performance:
  cpu_governor: "performance"
  numa_balancing: false
  transparent_hugepages: "madvise"
  swappiness: 1

# NVIDIA Settings
nvidia:
  persistence_mode: true
  power_limit: 300  # watts
  memory_clock_offset: 0
  graphics_clock_offset: 0
99
ansible/group_vars/load_balancer.yml
Normal file
@ -0,0 +1,99 @@
# ansible/group_vars/load_balancer.yml
# Generated by Terraform for Load Balancer servers

# System Configuration
ubuntu_version: "24.04"
haproxy_version: "2.8"

# Load Balancer Configuration
haproxy:
  global:
    maxconn: 4096
    log: "stdout local0"
    stats:
      socket: "/run/haproxy/admin.sock"
      timeout: "30s"
      level: "admin"

  defaults:
    mode: "http"
    timeout:
      connect: "5s"
      client: "30s"
      server: "30s"
    retries: 3
    option:
      - "httplog"
      - "dontlognull"
      - "redispatch"

  frontend:
    api_frontend:
      bind: "*:443 ssl crt /etc/ssl/certs/{{ ssl_certificate_name }}.pem"
      redirect: "scheme https if !{ ssl_fc }"
      default_backend: "vllm_backend"

    stats_frontend:
      bind: "*:8404"
      stats:
        enable: true
        uri: "/stats"
        refresh: "30s"
        admin: "if TRUE"

  backend:
    vllm_backend:
      balance: "roundrobin"
      option:
        - "httpchk GET /health"
      http_check: "expect status 200"
      servers: "{{ haproxy_backend_servers }}"

# SSL/TLS Configuration
ssl_config:
  certificate_type: "{{ ssl_certificate_type | default('letsencrypt') }}"
  certificate_name: "{{ ssl_certificate_name | default('ai-api') }}"
  cipher_suite: "ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384"
  protocols: "TLSv1.2 TLSv1.3"
  hsts_enabled: true
  hsts_max_age: 31536000

# Security Configuration
security:
  fail2ban_enabled: true
  rate_limiting:
    enabled: true
    requests_per_minute: 60
    burst_size: 20

  blocked_countries: []  # ISO country codes to block

  headers:
    - "X-Frame-Options: DENY"
    - "X-Content-Type-Options: nosniff"
    - "X-XSS-Protection: 1; mode=block"
    - "Referrer-Policy: strict-origin-when-cross-origin"

# Health Check Configuration
health_checks:
  backend_check_interval: "5s"
  backend_check_timeout: "3s"
  backend_rise: 2
  backend_fall: 3

# Logging Configuration
logging:
  access_log: "/var/log/haproxy/access.log"
  error_log: "/var/log/haproxy/error.log"
  log_level: "info"
  log_rotation:
    enabled: true
    frequency: "daily"
    retention: 30

# Monitoring
monitoring:
  haproxy_exporter:
    enabled: true
    port: 8405
    stats_url: "http://localhost:8404/stats"
132
ansible/inventory/production.yml
Normal file
@ -0,0 +1,132 @@
# Production inventory for AI Infrastructure
all:
  vars:
    ansible_user: ubuntu
    ansible_ssh_private_key_file: ~/.ssh/hetzner_key
    ansible_ssh_common_args: '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
    ansible_python_interpreter: /usr/bin/python3

    # Environment settings
    environment: production
    project_name: ai-infrastructure

    # Network configuration
    private_network_cidr: "10.0.0.0/16"
    gex44_subnet: "10.0.1.0/24"
    cloud_subnet: "10.0.2.0/24"

    # Security settings
    ansible_vault_password_file: /opt/.vault-pass

  children:
    # GPU servers (GEX44 dedicated servers)
    gex44:
      vars:
        # GPU configuration
        cuda_version: "12.3"
        gpu_type: "rtx_4000_ada"
        vram_size: 20480  # 20GB in MB

        # vLLM configuration
        vllm_version: "0.3.0"
        vllm_port: 8000
        vllm_host: "0.0.0.0"
        vllm_gpu_memory_utilization: 0.85
        vllm_max_model_len: 4096
        vllm_tensor_parallel_size: 1

        # Models configuration
        models_cache_dir: "/opt/vllm/models"
        models_to_download:
          - name: "mixtral-8x7b"
            repo: "mistralai/Mixtral-8x7B-Instruct-v0.1"
            enabled: true
          - name: "llama2-70b"
            repo: "meta-llama/Llama-2-70b-chat-hf"
            enabled: false  # Requires quantization
          - name: "codellama-34b"
            repo: "codellama/CodeLlama-34b-Instruct-hf"
            enabled: false

        # Monitoring
        node_exporter_port: 9100
        nvidia_exporter_port: 9835

      hosts:
        gex44-1:
          ansible_host: 10.0.1.10
          server_id: gex44-1
          gpu_index: 0
          vllm_model: "mixtral-8x7b"

        gex44-2:
          ansible_host: 10.0.1.11
          server_id: gex44-2
          gpu_index: 1
          vllm_model: "mixtral-8x7b"

        gex44-3:
          ansible_host: 10.0.1.12
          server_id: gex44-3
          gpu_index: 2
          vllm_model: "mixtral-8x7b"

    # Cloud servers
    cloud_servers:
      vars:
        # Basic cloud server settings
        server_type: "cloud"
        monitoring_enabled: true

      children:
        # Load balancers
        load_balancers:
          vars:
            haproxy_version: "2.4"
            haproxy_stats_port: 8404
            haproxy_stats_user: admin
            ssl_enabled: true

          hosts:
            load-balancer:
              ansible_host: 10.0.2.10
              server_id: lb-1
              public_ip: "{{ load_balancer_public_ip | default('') }}"

        # API gateways
        api_gateways:
          vars:
            nginx_version: "1.22"
            api_rate_limit: "100r/m"

          hosts:
            api-gateway:
              ansible_host: 10.0.2.11
              server_id: api-gw-1
              public_ip: "{{ api_gateway_public_ip | default('') }}"

        # Monitoring servers
        monitoring:
          vars:
            prometheus_version: "2.47"
            grafana_version: "10.2"
            prometheus_retention: "30d"
            prometheus_port: 9090
            grafana_port: 3000
            alertmanager_port: 9093

          hosts:
            monitoring:
              ansible_host: 10.0.2.12
              server_id: monitoring-1
              public_ip: "{{ monitoring_public_ip | default('') }}"

    # Autoscaler (runs on monitoring server)
    autoscaler:
      hosts:
        monitoring:
          autoscaler_enabled: true
          min_gex44_count: 1
          max_gex44_count: 10
          scale_up_threshold: 0.8
          scale_down_threshold: 0.3
140
ansible/playbooks/gex44-setup.yml
Normal file
@ -0,0 +1,140 @@
# GEX44 GPU servers configuration playbook
---
- name: Configure GEX44 GPU servers for AI inference
  hosts: gex44
  become: yes
  gather_facts: yes

  vars:
    # Override for specific deployment targets
    target_model: "{{ vllm_model | default(default_model) }}"

  pre_tasks:
    - name: Verify GPU hardware
      shell: lspci | grep -i nvidia
      register: gpu_check
      failed_when: gpu_check.rc != 0

    - name: Display GPU information
      debug:
        msg: "Detected GPU: {{ gpu_check.stdout }}"

    - name: Check available disk space
      setup:
        gather_subset:
          - hardware

    - name: Ensure sufficient disk space for models
      assert:
        that:
          - ansible_mounts | selectattr('mount', 'equalto', '/') | map(attribute='size_available') | first > 200000000000
        fail_msg: "Insufficient disk space. Need at least 200GB free for models."
        success_msg: "Sufficient disk space available"

  roles:
    - cuda
    - docker
    - vllm
    - monitoring-agent
    - security

  post_tasks:
    - name: Verify CUDA installation
      shell: nvidia-smi
      register: nvidia_smi_output
      failed_when: nvidia_smi_output.rc != 0

    - name: Display CUDA information
      debug:
        msg: "{{ nvidia_smi_output.stdout }}"

    - name: Test GPU accessibility from Python
      shell: |
        python3 -c "
        import torch
        print(f'CUDA available: {torch.cuda.is_available()}')
        if torch.cuda.is_available():
            print(f'CUDA devices: {torch.cuda.device_count()}')
            print(f'Current device: {torch.cuda.current_device()}')
            print(f'Device name: {torch.cuda.get_device_name(0)}')
            print(f'Device memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB')
        "
      register: torch_cuda_test

    - name: Display PyTorch CUDA test results
      debug:
        msg: "{{ torch_cuda_test.stdout }}"

    - name: Download and cache target model
      include_role:
        name: vllm
        tasks_from: download_model
      vars:
        model_config: "{{ available_models[target_model] }}"

    - name: Start vLLM service with target model
      systemd:
        name: vllm-api
        state: started
        enabled: yes
        daemon_reload: yes
      environment:
        VLLM_MODEL: "{{ target_model }}"

    - name: Wait for vLLM service to be ready
      uri:
        url: "http://localhost:{{ vllm_port }}/health"
        method: GET
        status_code: 200
      register: health_check
      until: health_check.status == 200
      retries: 30
      delay: 10

    - name: Test inference endpoint
      uri:
        url: "http://localhost:{{ vllm_port }}/v1/models"
        method: GET
        return_content: yes
      register: models_response

    - name: Display available models
      debug:
        msg: "Available models: {{ models_response.json.data | map(attribute='id') | list }}"

    - name: Test inference with simple prompt
      uri:
        url: "http://localhost:{{ vllm_port }}/v1/chat/completions"
        method: POST
        body_format: json
        body:
          model: "{{ target_model }}"
          messages:
            - role: "user"
              content: "Hello! Please respond with 'GPU server {{ inventory_hostname }} is working correctly.'"
          max_tokens: 50
          temperature: 0.1
        status_code: 200
      register: inference_test

    - name: Display inference test result
      debug:
        msg: "Inference test: {{ inference_test.json.choices[0].message.content }}"

    - name: Register server in load balancer (if using dynamic registration)
      uri:
        url: "http://{{ hostvars[groups['load_balancers'][0]]['ansible_host'] }}:8404/stats"
        method: GET
      delegate_to: "{{ groups['load_balancers'][0] }}"
      ignore_errors: yes

  handlers:
    - name: restart nvidia-persistenced
      systemd:
        name: nvidia-persistenced
        state: restarted

    - name: restart vllm-api
      systemd:
        name: vllm-api
        state: restarted
70
ansible/playbooks/site.yml
Normal file
@ -0,0 +1,70 @@
# Main site playbook for AI Infrastructure
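# Typical invocation from the repo root (paths assumed from this repo's layout):
#   ansible-playbook -i ansible/inventory/production.yml ansible/playbooks/site.yml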
---
- name: Configure all infrastructure
  hosts: all
  become: yes
  gather_facts: yes

  pre_tasks:
    - name: Update package cache
      apt:
        update_cache: yes
        cache_valid_time: 3600
      when: ansible_os_family == "Debian"

    - name: Install common packages
      apt:
        name: "{{ common_packages }}"
        state: present
      when: ansible_os_family == "Debian"

    - name: Set timezone
      timezone:
        name: "{{ timezone }}"

    - name: Configure NTP
      apt:
        name: ntp
        state: present
      notify: restart ntp

  roles:
    - common

  handlers:
    - name: restart ntp
      systemd:
        name: ntp
        state: restarted

# Configure GEX44 GPU servers
- import_playbook: gex44-setup.yml

# Configure load balancers
- import_playbook: load-balancer-setup.yml

# Configure API gateways
- import_playbook: api-gateway-setup.yml

# Configure monitoring
- import_playbook: monitoring-setup.yml

# Final validation
- name: Validate infrastructure
  hosts: all
  become: yes
  tasks:
    - name: Check service status
      systemd:
        name: "{{ item }}"
        state: started
      loop:
        - ssh
        - ntp
      check_mode: yes

    - name: Test connectivity between servers
      ping:
      delegate_to: "{{ item }}"
      loop: "{{ groups['all'] }}"
      when: item != inventory_hostname
31
ansible/requirements.yml
Normal file
@ -0,0 +1,31 @@
# Ansible Galaxy requirements for AI Infrastructure

collections:
  - name: community.general
    version: ">=7.0.0"
  - name: community.docker
    version: ">=3.0.0"
  - name: ansible.posix
    version: ">=1.5.0"
  - name: community.crypto
    version: ">=2.0.0"
  - name: community.mysql
    version: ">=3.0.0"
  - name: prometheus.prometheus
    version: ">=0.13.0"
  - name: grafana.grafana
    version: ">=2.0.0"

roles:
  - name: geerlingguy.docker
    version: ">=6.0.0"
  - name: geerlingguy.pip
    version: ">=2.0.0"
  - name: geerlingguy.nodejs
    version: ">=6.0.0"
  - name: cloudalchemy.prometheus
    version: ">=2.17.0"
  - name: cloudalchemy.grafana
    version: ">=0.22.0"
  - name: cloudalchemy.node_exporter
    version: ">=3.0.0"
117
ansible/roles/ssl-certificates/tasks/generate_certificate.yml
Normal file
@ -0,0 +1,117 @@
# ansible/roles/ssl-certificates/tasks/generate_certificate.yml
# Generate individual SSL certificate based on requirements

---
- name: Set certificate facts
  set_fact:
    cert_name: "{{ cert_config.name }}"
    cert_type: "{{ cert_config.type }}"
    cert_domains: "{{ cert_config.domains }}"
    dns_provider: "{{ cert_config.dns_provider | default('hetzner') }}"
    key_size: "{{ cert_config.key_size | default(2048) }}"
    cert_tags: "{{ cert_config.tags | default([]) }}"

- name: Generate Let's Encrypt certificate
  command: >
    certbot certonly
    --dns-hetzner
    --dns-hetzner-credentials /etc/letsencrypt/hetzner-dns.ini
    --dns-hetzner-propagation-seconds 60
    --non-interactive
    --agree-tos
    --email "{{ ssl_admin_email | default('admin@company.com') }}"
    --cert-name "{{ cert_name }}"
    {% for domain in cert_domains %}
    -d "{{ domain }}"
    {% endfor %}
    --key-type rsa
    --rsa-key-size "{{ key_size }}"
  when:
    - cert_type == "letsencrypt"
    - dns_provider == "hetzner"
  register: letsencrypt_result
  failed_when:
    - letsencrypt_result.rc != 0
    - "'already exists' not in letsencrypt_result.stderr"

- name: Generate self-signed certificate for development
  block:
    - name: Create private key
      openssl_privatekey:
        path: "/etc/ssl/private/{{ cert_name }}.key"
        size: "{{ key_size }}"
        type: RSA
        mode: '0600'

    - name: Create certificate signing request
      openssl_csr:
        path: "/etc/ssl/requests/{{ cert_name }}.csr"
        privatekey_path: "/etc/ssl/private/{{ cert_name }}.key"
        common_name: "{{ cert_domains[0] }}"
        subject_alt_name: "{{ cert_domains | map('regex_replace', '^', 'DNS:') | list }}"
        organization_name: "Company Development"
        country_name: "FR"

    - name: Create self-signed certificate
      openssl_certificate:
        path: "/etc/ssl/certs/{{ cert_name }}.crt"
        privatekey_path: "/etc/ssl/private/{{ cert_name }}.key"
        csr_path: "/etc/ssl/requests/{{ cert_name }}.csr"
        provider: selfsigned
        selfsigned_not_after: "+365d"
        mode: '0644'
  when: cert_type == "self-signed"

- name: Handle commercial certificate placeholder
  block:
    - name: Create placeholder for commercial certificate
      copy:
        content: |
          # Commercial certificate placeholder for {{ cert_name }}
          # Domains: {{ cert_domains | join(', ') }}
          # Tags: {{ cert_tags | join(', ') }}
          #
          # Place your commercial certificate files at:
          # Certificate: /etc/ssl/certs/{{ cert_name }}.crt
          # Private Key: /etc/ssl/private/{{ cert_name }}.key
          # CA Bundle: /etc/ssl/certs/{{ cert_name }}-ca-bundle.crt
        dest: "/etc/ssl/certs/{{ cert_name }}-README.txt"
        mode: '0644'

    - name: Check if commercial certificate exists
      stat:
        path: "/etc/ssl/certs/{{ cert_name }}.crt"
      register: commercial_cert

    - name: Warning for missing commercial certificate
      debug:
        msg: "WARNING: Commercial certificate {{ cert_name }} not found. Please install manually."
      when: not commercial_cert.stat.exists
  when: cert_type == "commercial"

- name: Create combined PEM file for HAProxy
  shell: |
    cat /etc/ssl/certs/{{ cert_name }}.crt \
        /etc/ssl/private/{{ cert_name }}.key \
        > /etc/ssl/certs/{{ cert_name }}.pem
  when:
    - cert_type in ['letsencrypt', 'self-signed']
    - "'load_balancer' in group_names"
  notify: restart haproxy

- name: Set certificate file permissions
  file:
    path: "{{ item.path }}"
    owner: "{{ item.owner }}"
    group: "{{ item.group }}"
    mode: "{{ item.mode }}"
  loop:
    - { path: "/etc/ssl/certs/{{ cert_name }}.pem", owner: "root", group: "haproxy", mode: "0640" }
    - { path: "/etc/ssl/private/{{ cert_name }}.key", owner: "root", group: "ssl-cert", mode: "0640" }
  when:
    - cert_type in ['letsencrypt', 'self-signed']
    - "'load_balancer' in group_names"

- name: Add certificate to inventory facts
  set_fact:
    deployed_certificates: "{{ deployed_certificates | default([]) + [cert_config] }}"
58
ansible/roles/ssl-certificates/tasks/main.yml
Normal file
@ -0,0 +1,58 @@
# ansible/roles/ssl-certificates/tasks/main.yml
# SSL Certificate management role

---
- name: Install certificate management tools
  package:
    name:
      - certbot
      - python3-certbot-dns-hetzner
      - openssl
    state: present
  when: ansible_os_family == "Debian" and ansible_distribution_version == "24.04"

- name: Create SSL directories
  file:
    path: "{{ item }}"
    state: directory
    mode: '0755'
  loop:
    - /etc/ssl/certs
    - /etc/ssl/private
    - /etc/ssl/requests
    - /var/lib/certbot

- name: Generate SSL certificates per environment requirements
  include_tasks: generate_certificate.yml
  vars:
    cert_config: "{{ item }}"
  loop: "{{ ssl_certificates }}"
  when: ssl_certificates is defined

- name: Setup certificate renewal cron
  cron:
    name: "SSL certificate renewal"
    minute: "0"
    hour: "2"
    job: "/usr/bin/certbot renew --quiet && systemctl reload haproxy"
    user: root
  when: auto_renewal_enabled | default(true)

- name: Configure Hetzner DNS API for certificate validation
  template:
    src: hetzner-dns.ini.j2
    dest: /etc/letsencrypt/hetzner-dns.ini
    mode: '0600'
    owner: root
    group: root
  when:
    - dns_provider == "hetzner"
    - hetzner_dns_token is defined
  no_log: true

- name: Setup certificate monitoring
  template:
    src: cert-monitor.sh.j2
    dest: /usr/local/bin/cert-monitor.sh
    mode: '0755'
  when: monitoring_enabled | default(true)
207
ansible/roles/vllm/tasks/main.yml
Normal file
@ -0,0 +1,207 @@
# vLLM role main tasks
---
- name: Create vLLM user
  user:
    name: "{{ vllm_user }}"
    group: "{{ vllm_group }}"
    system: yes
    shell: /bin/false
    home: "{{ vllm_home }}"
    create_home: yes

- name: Create vLLM directories
  file:
    path: "{{ item }}"
    state: directory
    owner: "{{ vllm_user }}"
    group: "{{ vllm_group }}"
    mode: '0755'
  loop:
    - "{{ vllm_home }}"
    - "{{ models_base_dir }}"
    - "{{ models_cache_dir }}"
    - "{{ huggingface_cache_dir }}"
    - "{{ vllm_log_dir }}"
    - "{{ temp_dir }}"

- name: Install Python dependencies for vLLM
  pip:
    name:
      - torch>=2.1.0
      - transformers>=4.36.0
      - accelerate>=0.24.0
      - sentencepiece>=0.1.99
      - protobuf>=3.20.0
      - huggingface-hub>=0.19.0
      - tokenizers>=0.15.0
    extra_args: --index-url https://download.pytorch.org/whl/cu121
    executable: pip3

- name: Install vLLM
  pip:
    name: "vllm[cuda]=={{ vllm_version }}"
    executable: pip3

- name: Install additional dependencies
  pip:
    name:
      - fastapi>=0.104.0
      - uvicorn>=0.24.0
      - prometheus-client>=0.19.0
      - psutil>=5.9.0
    executable: pip3

- name: Create vLLM configuration directory
  file:
    path: /etc/vllm
    state: directory
    mode: '0755'

- name: Generate vLLM configuration
  template:
    src: vllm-config.env.j2
    dest: /etc/vllm/config.env
    owner: root
    group: "{{ vllm_group }}"
    mode: '0640'
  notify: restart vllm-api

- name: Create vLLM systemd service
  template:
    src: vllm-api.service.j2
    dest: /etc/systemd/system/vllm-api.service
    owner: root
    group: root
    mode: '0644'
  notify:
    - reload systemd
    - restart vllm-api

- name: Create vLLM startup script
  template:
    src: start-vllm.sh.j2
    dest: "{{ vllm_home }}/start-vllm.sh"
    owner: "{{ vllm_user }}"
    group: "{{ vllm_group }}"
    mode: '0755'

- name: Create model download script
  template:
    src: download-model.py.j2
    dest: "{{ vllm_home }}/download-model.py"
    owner: "{{ vllm_user }}"
    group: "{{ vllm_group }}"
    mode: '0755'

- name: Create health check script
  template:
    src: health-check.sh.j2
    dest: "{{ vllm_home }}/health-check.sh"
    owner: "{{ vllm_user }}"
    group: "{{ vllm_group }}"
    mode: '0755'

- name: Configure logrotate for vLLM
  template:
    src: vllm-logrotate.j2
    dest: /etc/logrotate.d/vllm
    owner: root
    group: root
    mode: '0644'

- name: Setup tmpfs for temporary model files
  mount:
    path: "{{ temp_dir }}"
    src: tmpfs
    fstype: tmpfs
    opts: "size={{ temp_dir_size }},uid={{ vllm_user }},gid={{ vllm_group }}"
    state: mounted
  when: temp_dir_size is defined

- name: Create model management script
  template:
    src: manage-models.sh.j2
    dest: "{{ vllm_home }}/manage-models.sh"
    owner: "{{ vllm_user }}"
    group: "{{ vllm_group }}"
    mode: '0755'

- name: Setup GPU memory management
  template:
    src: gpu-setup.sh.j2
    dest: "{{ vllm_home }}/gpu-setup.sh"
    owner: root
    group: root
    mode: '0755'
  notify: run gpu setup

- name: Configure vLLM environment variables
  template:
    src: vllm.env.j2
    dest: /etc/environment.d/vllm.conf
    owner: root
    group: root
    mode: '0644'

- name: Create vLLM metrics exporter
  template:
    src: vllm-metrics.py.j2
    dest: "{{ vllm_home }}/vllm-metrics.py"
    owner: "{{ vllm_user }}"
    group: "{{ vllm_group }}"
    mode: '0755'

- name: Setup vLLM metrics service
  template:
    src: vllm-metrics.service.j2
    dest: /etc/systemd/system/vllm-metrics.service
    owner: root
    group: root
    mode: '0644'
  notify:
    - reload systemd
    - restart vllm-metrics

- name: Enable and start vLLM services
  systemd:
    name: "{{ item }}"
    enabled: yes
    daemon_reload: yes
  loop:
    - vllm-api
    - vllm-metrics

- name: Download default model if specified
  include_tasks: download_model.yml
  vars:
    model_name: "{{ default_model }}"
    model_config: "{{ available_models[default_model] }}"
  when:
    - default_model is defined
    - available_models[default_model].enabled | default(true)

- name: Create model validation script
  template:
    src: validate-model.py.j2
    dest: "{{ vllm_home }}/validate-model.py"
    owner: "{{ vllm_user }}"
    group: "{{ vllm_group }}"
    mode: '0755'

- name: Setup model update cron job
  cron:
    name: "Check for model updates"
    minute: "0"
    hour: "2"
    job: "{{ vllm_home }}/manage-models.sh update >> {{ vllm_log_dir }}/model-updates.log 2>&1"
    user: "{{ vllm_user }}"
  when: auto_update_models | default(false)

- name: Configure firewall for vLLM
  ufw:
    rule: allow
    port: "{{ vllm_port }}"
    proto: tcp
    src: "{{ cloud_subnet }}"
    comment: "vLLM API access from cloud servers"
  when: firewall_enabled | default(true)
247
ansible/roles/vllm/tasks/updated_main.yml
Normal file
@ -0,0 +1,247 @@
# vLLM role main tasks - Updated with latest vLLM practices (2024)
---
- name: Create vLLM user
  user:
    name: "{{ vllm_user }}"
    group: "{{ vllm_group }}"
    system: yes
    shell: /bin/false
    home: "{{ vllm_home }}"
    create_home: yes

- name: Create vLLM directories
  file:
    path: "{{ item }}"
    state: directory
    owner: "{{ vllm_user }}"
    group: "{{ vllm_group }}"
    mode: '0755'
  loop:
    - "{{ vllm_home }}"
    - "{{ models_base_dir }}"
    - "{{ models_cache_dir }}"
    - "{{ huggingface_cache_dir }}"
    - "{{ vllm_log_dir }}"
    - "{{ temp_dir }}"

# Updated installation using latest vLLM with nightly wheels
- name: Install latest PyTorch with CUDA support
  pip:
    name:
      - torch>=2.5.0
      - torchvision>=0.20.0
      - torchaudio>=2.5.0
    extra_args: --index-url https://download.pytorch.org/whl/cu121
    executable: pip3

- name: Install latest vLLM from nightly wheels
  pip:
    name: vllm
    extra_args: >-
      --pre
      --extra-index-url https://wheels.vllm.ai/nightly
      --torch-backend=auto
    executable: pip3

- name: Install additional vLLM dependencies for production
  pip:
    name:
      - transformers>=4.46.0
      - accelerate>=0.34.0
      - sentencepiece>=0.2.0
      - protobuf>=5.28.0
      - huggingface-hub>=0.25.0
      - tokenizers>=0.20.0
      - fastapi>=0.115.0
      - uvicorn[standard]>=0.31.0
      - pydantic>=2.9.0
      - prometheus-client>=0.21.0
      - psutil>=6.1.0
      - ray[serve]>=2.39.0  # For distributed serving
    executable: pip3

# Install TorchAO for advanced quantization support
- name: Install TorchAO nightly for quantization
  pip:
    name: torchao
    extra_args: >-
      --pre
      --index-url https://download.pytorch.org/whl/nightly/cu121
    executable: pip3
  when: enable_quantization | default(true)

- name: Create vLLM configuration directory
  file:
    path: /etc/vllm
    state: directory
    mode: '0755'

- name: Generate updated vLLM configuration
  template:
    src: vllm-config-2024.env.j2
    dest: /etc/vllm/config.env
    owner: root
    group: "{{ vllm_group }}"
    mode: '0640'
  notify: restart vllm-api

- name: Create modern vLLM systemd service
  template:
    src: vllm-api-2024.service.j2
    dest: /etc/systemd/system/vllm-api.service
    owner: root
    group: root
    mode: '0644'
  notify:
    - reload systemd
    - restart vllm-api

- name: Create vLLM startup script with latest options
  template:
    src: start-vllm-2024.sh.j2
    dest: "{{ vllm_home }}/start-vllm.sh"
    owner: "{{ vllm_user }}"
    group: "{{ vllm_group }}"
    mode: '0755'

- name: Create enhanced model download script
  template:
    src: download-model-2024.py.j2
    dest: "{{ vllm_home }}/download-model.py"
    owner: "{{ vllm_user }}"
    group: "{{ vllm_group }}"
    mode: '0755'

- name: Create production health check script
  template:
    src: health-check-2024.sh.j2
    dest: "{{ vllm_home }}/health-check.sh"
    owner: "{{ vllm_user }}"
    group: "{{ vllm_group }}"
    mode: '0755'

- name: Configure enhanced logrotate for vLLM
  template:
    src: vllm-logrotate-2024.j2
    dest: /etc/logrotate.d/vllm
    owner: root
    group: root
    mode: '0644'

- name: Setup tmpfs for temporary model files (if enabled)
  mount:
    path: "{{ temp_dir }}"
    src: tmpfs
    fstype: tmpfs
    opts: "size={{ temp_dir_size }},uid={{ vllm_user }},gid={{ vllm_group }}"
    state: mounted
  when: temp_dir_size is defined

- name: Create model management script with latest HF integration
  template:
    src: manage-models-2024.sh.j2
    dest: "{{ vllm_home }}/manage-models.sh"
    owner: "{{ vllm_user }}"
    group: "{{ vllm_group }}"
    mode: '0755'

- name: Setup enhanced GPU configuration
  template:
    src: gpu-setup-2024.sh.j2
    dest: "{{ vllm_home }}/gpu-setup.sh"
    owner: root
    group: root
    mode: '0755'
  notify: run gpu setup

- name: Configure vLLM environment variables for 2024
  template:
    src: vllm-2024.env.j2
    dest: /etc/environment.d/vllm.conf
    owner: root
    group: root
    mode: '0644'

- name: Create enhanced vLLM metrics exporter
  template:
    src: vllm-metrics-2024.py.j2
    dest: "{{ vllm_home }}/vllm-metrics.py"
    owner: "{{ vllm_user }}"
    group: "{{ vllm_group }}"
    mode: '0755'

- name: Setup vLLM metrics service with latest endpoints
  template:
    src: vllm-metrics-2024.service.j2
    dest: /etc/systemd/system/vllm-metrics.service
    owner: root
    group: root
    mode: '0644'
  notify:
    - reload systemd
    - restart vllm-metrics

- name: Enable and start vLLM services
  systemd:
    name: "{{ item }}"
    enabled: yes
    daemon_reload: yes
  loop:
    - vllm-api
    - vllm-metrics

- name: Download default model if specified
  include_tasks: download_model_2024.yml
  vars:
    model_name: "{{ default_model }}"
    model_config: "{{ available_models[default_model] }}"
  when:
    - default_model is defined
    - available_models[default_model].enabled | default(true)

- name: Create enhanced model validation script
  template:
    src: validate-model-2024.py.j2
    dest: "{{ vllm_home }}/validate-model.py"
    owner: "{{ vllm_user }}"
    group: "{{ vllm_group }}"
    mode: '0755'

- name: Setup model update cron job (with safety checks)
  cron:
    name: "Check for model updates"
    minute: "0"
    hour: "2"
    job: "{{ vllm_home }}/manage-models.sh update >> {{ vllm_log_dir }}/model-updates.log 2>&1"
    user: "{{ vllm_user }}"
  when: auto_update_models | default(false)

- name: Configure firewall for vLLM
  ufw:
    rule: allow
    port: "{{ vllm_port }}"
    proto: tcp
    src: "{{ cloud_subnet }}"
    comment: "vLLM API access from cloud servers"
  when: firewall_enabled | default(true)

# New: Setup vLLM production stack integration (optional)
- name: Install vLLM production stack Helm chart (if enabled)
  include_tasks: setup_production_stack.yml
  when: vllm_production_stack_enabled | default(false)

# New: Configure expert parallelism for large models
- name: Configure expert parallelism settings
  template:
    src: expert-parallel-2024.conf.j2
    dest: /etc/vllm/expert-parallel.conf
    owner: "{{ vllm_user }}"
    group: "{{ vllm_group }}"
    mode: '0644'
  when: enable_expert_parallel | default(false)
  notify: restart vllm-api

# New: Setup Ray cluster for distributed serving
- name: Setup Ray cluster for distributed vLLM
  include_tasks: setup_ray_cluster.yml
  when: enable_distributed_serving | default(false)
71
ansible/roles/vllm/templates/vllm-api.service.j2
Normal file
@ -0,0 +1,71 @@
[Unit]
Description=vLLM API Server for {{ inventory_hostname }}
After=network.target nvidia-persistenced.service
Requires=nvidia-persistenced.service
StartLimitIntervalSec=0

[Service]
Type=exec
User={{ vllm_user }}
Group={{ vllm_group }}
WorkingDirectory={{ vllm_home }}

# Environment configuration
Environment=CUDA_VISIBLE_DEVICES=0
Environment=NCCL_DEBUG=INFO
Environment=PYTHONPATH={{ vllm_home }}
Environment=HF_HOME={{ huggingface_cache_dir }}
Environment=TRANSFORMERS_CACHE={{ huggingface_cache_dir }}/transformers
Environment=HF_DATASETS_CACHE={{ huggingface_cache_dir }}/datasets
EnvironmentFile=/etc/vllm/config.env

# Service configuration
# Note: systemd does not support shell-style ${VAR:-default} expansion, so
# VLLM_MODEL and CHAT_TEMPLATE take their defaults from /etc/vllm/config.env.
ExecStartPre=/bin/bash {{ vllm_home }}/gpu-setup.sh
ExecStart=/usr/local/bin/python -m vllm.entrypoints.openai.api_server \
    --model {{ models_base_dir }}/${VLLM_MODEL} \
    --host {{ vllm_host }} \
    --port {{ vllm_port }} \
    --tensor-parallel-size {{ vllm_tensor_parallel_size }} \
    --pipeline-parallel-size {{ vllm_pipeline_parallel_size }} \
    --gpu-memory-utilization {{ vllm_gpu_memory_utilization }} \
    --max-model-len {{ vllm_max_model_len }} \
    --max-num-batched-tokens {{ vllm_max_num_batched_tokens }} \
    --max-num-seqs {{ vllm_max_num_seqs }} \
    --block-size {{ vllm_block_size }} \
    --swap-space {{ vllm_swap_space }} \
    --disable-log-requests \
    --served-model-name ${VLLM_MODEL} \
    --chat-template ${CHAT_TEMPLATE}

ExecReload=/bin/kill -HUP $MAINPID
KillMode=mixed
Restart=always
RestartSec=30

# Resource limits
MemoryMax=45G
MemoryHigh=40G
LimitNOFILE=65536
LimitNPROC=32768

# Security
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ReadWritePaths={{ vllm_home }}
ReadWritePaths={{ models_base_dir }}
ReadWritePaths={{ models_cache_dir }}
ReadWritePaths={{ huggingface_cache_dir }}
ReadWritePaths={{ vllm_log_dir }}
ReadWritePaths={{ temp_dir }}

# Logging
StandardOutput=journal
StandardError=journal
SyslogIdentifier=vllm-api

# Startup timeout (model loading can take time)
TimeoutStartSec=600

[Install]
WantedBy=multi-user.target
84
ansible/roles/vllm/templates/vllm-config.env.j2
Normal file
@ -0,0 +1,84 @@
# vLLM Configuration Environment Variables
# Generated by Ansible for {{ inventory_hostname }}

# Model configuration
VLLM_MODEL={{ default_model }}
# systemd's EnvironmentFile performs no variable expansion, so the path is
# rendered directly by Ansible rather than referencing ${VLLM_MODEL}.
VLLM_MODEL_PATH={{ models_base_dir }}/{{ default_model }}
CHAT_TEMPLATE=auto

# Server configuration
VLLM_HOST={{ vllm_host }}
VLLM_PORT={{ vllm_port }}
VLLM_WORKERS={{ vllm_workers }}
VLLM_LOG_LEVEL={{ vllm_log_level }}

# Performance configuration
VLLM_GPU_MEMORY_UTILIZATION={{ vllm_gpu_memory_utilization }}
VLLM_MAX_MODEL_LEN={{ vllm_max_model_len }}
VLLM_MAX_NUM_BATCHED_TOKENS={{ vllm_max_num_batched_tokens }}
VLLM_MAX_NUM_SEQS={{ vllm_max_num_seqs }}
VLLM_TENSOR_PARALLEL_SIZE={{ vllm_tensor_parallel_size }}
VLLM_PIPELINE_PARALLEL_SIZE={{ vllm_pipeline_parallel_size }}
VLLM_BLOCK_SIZE={{ vllm_block_size }}
VLLM_SWAP_SPACE={{ vllm_swap_space }}

# CUDA configuration
CUDA_VISIBLE_DEVICES=0
CUDA_LAUNCH_BLOCKING=0
NCCL_DEBUG=WARN
NCCL_P2P_DISABLE=1

# HuggingFace configuration
HF_HOME={{ huggingface_cache_dir }}
TRANSFORMERS_CACHE={{ huggingface_cache_dir }}/transformers
HF_DATASETS_CACHE={{ huggingface_cache_dir }}/datasets
HF_DATASETS_OFFLINE=0
TRANSFORMERS_OFFLINE=0

# Python configuration
PYTHONPATH={{ vllm_home }}
PYTHONUNBUFFERED=1
PYTHONDONTWRITEBYTECODE=1

# Logging configuration
VLLM_LOG_DIR={{ vllm_log_dir }}
VLLM_LOG_MAX_SIZE={{ vllm_log_max_size }}
VLLM_LOG_MAX_FILES={{ vllm_log_max_files }}

# Performance monitoring
PROMETHEUS_MULTIPROC_DIR=/tmp/vllm_metrics
VLLM_METRICS_ENABLED=true
VLLM_METRICS_PORT=9000

# Memory management
VLLM_USE_MODELSCOPE=false
VLLM_ATTENTION_BACKEND=FLASH_ATTN
VLLM_FLASH_ATTN_V2_ENABLED=true

# Tokenizer configuration
TOKENIZERS_PARALLELISM=false

# Security
VLLM_DISABLE_CUSTOM_ALL_REDUCE=true
VLLM_ALLOW_DEPRECATED_LEGACY_API=false

# Development (only for non-production)
{% if environment != 'production' %}
VLLM_DEBUG=false
VLLM_TRACE_FUNCTION=false
{% endif %}

# Model-specific configurations
{% if default_model == 'mixtral-8x7b' %}
# Mixtral-8x7B specific optimizations
VLLM_USE_XFORMERS=true
VLLM_ENABLE_CHUNKED_PREFILL=true
{% elif default_model == 'llama2-70b' %}
# Llama2-70B specific optimizations
VLLM_QUANTIZATION=awq
VLLM_ENFORCE_EAGER=true
{% elif default_model == 'codellama-34b' %}
# CodeLlama-34B specific optimizations
VLLM_USE_XFORMERS=true
VLLM_ENABLE_CHUNKED_PREFILL=true
{% endif %}
302
docs/APPLICATIONS.md
Normal file
@ -0,0 +1,302 @@
# Multi-Project & Multi-Team Organization

## Proposed Structure

```
ai-infrastructure/
├── infrastructure/                # Shared infrastructure (current)
│   ├── terraform/
│   ├── ansible/
│   └── inventories/
│
├── applications/                  # Business applications per team
│   ├── team-frontend/
│   │   ├── web-app-react/         # React application
│   │   │   ├── src/
│   │   │   ├── Dockerfile
│   │   │   ├── .gitlab-ci.yml     # App-specific CI/CD
│   │   │   └── k8s/               # Kubernetes manifests
│   │   └── mobile-app-react-native/
│   │
│   ├── team-backend/
│   │   ├── api-python-fastapi/    # Python FastAPI API
│   │   │   ├── app/
│   │   │   ├── requirements.txt
│   │   │   ├── Dockerfile
│   │   │   ├── .gitlab-ci.yml
│   │   │   └── k8s/
│   │   ├── api-laravel/           # Laravel API
│   │   │   ├── app/
│   │   │   ├── composer.json
│   │   │   ├── Dockerfile
│   │   │   └── k8s/
│   │   └── microservice-payment/
│   │
│   ├── team-ai/
│   │   ├── model-training/        # Training scripts
│   │   ├── inference-service/     # Custom inference service
│   │   └── data-processing/
│   │
│   └── team-devops/
│       ├── monitoring-dashboards/ # Custom Grafana dashboards
│       ├── backup-scripts/
│       └── security-tools/
│
└── deployment/                    # Orchestrated deployment
    ├── environments/
    │   ├── development/
    │   │   ├── apps-config.yml    # App config for dev
    │   │   └── routing.yml        # HAProxy routing
    │   ├── staging/
    │   └── production/
    │
    └── scripts/
        ├── deploy-all.sh          # Full deployment
        ├── deploy-team.sh         # Per-team deployment
        └── rollback.sh
```

## Deployment Strategy

### 1. GPU Infrastructure (Existing)
- **Role**: Host the AI inference services only
- **Technologies**: vLLM, LLM models
- **Servers**: GEX44 with RTX 4000 Ada

### 2. Web/API Applications
- **Role**: Standard business services (web, APIs, databases)
- **Technologies**: React, FastAPI, Laravel, PostgreSQL, Redis
- **Servers**: Hetzner Cloud (CX31, CX41) + Kubernetes or Docker Swarm

### 3. Integration
```yaml
# applications/team-frontend/web-app-react/.gitlab-ci.yml
stages:
  - build
  - test
  - deploy-dev
  - deploy-staging
  - deploy-prod

variables:
  IMAGE: registry.gitlab.com/company/web-app-react
  AI_API_URL_DEV: "http://dev-ai-server:8000"
  AI_API_URL_PROD: "https://ai-api.company.com"

build:
  stage: build
  script:
    - docker build -t $IMAGE:$CI_COMMIT_SHA .
    - docker push $IMAGE:$CI_COMMIT_SHA

deploy_production:
  stage: deploy-prod
  script:
    - kubectl set image deployment/web-app web-app=$IMAGE:$CI_COMMIT_SHA
  environment:
    name: production
    url: https://app.company.com
```

## Per-Environment Configuration

### Development
```yaml
# deployment/environments/development/apps-config.yml
applications:
  web-app-react:
    replicas: 1
    resources:
      cpu: "100m"
      memory: "128Mi"
    env:
      AI_API_URL: "http://dev-ai-server:8000"
      DATABASE_URL: "postgres://dev-db:5432/app"

  api-python-fastapi:
    replicas: 1
    resources:
      cpu: "200m"
      memory: "256Mi"
    env:
      AI_SERVICE_URL: "http://dev-ai-server:8000/v1"
      REDIS_URL: "redis://dev-redis:6379"
```

### Production
```yaml
# deployment/environments/production/apps-config.yml
applications:
  web-app-react:
    replicas: 3
    resources:
      cpu: "500m"
      memory: "512Mi"
    env:
      AI_API_URL: "https://ai-api.company.com"
      DATABASE_URL: "postgres://prod-db:5432/app"

  api-python-fastapi:
    replicas: 5
    resources:
      cpu: "1000m"
      memory: "1Gi"
    env:
      AI_SERVICE_URL: "https://ai-api.company.com/v1"
      REDIS_URL: "redis://prod-redis:6379"

  api-laravel:
    replicas: 3
    resources:
      cpu: "800m"
      memory: "768Mi"
    env:
      AI_API_ENDPOINT: "https://ai-api.company.com/v1/chat"
```

## HAProxy Routing

```haproxy
# deployment/environments/production/routing.yml
frontend web_frontend
    bind *:80
    bind *:443 ssl crt /etc/ssl/certs/company.pem

    # Web applications
    acl is_web_app hdr(host) -i app.company.com
    acl is_api_python hdr(host) -i api.company.com
    acl is_api_laravel hdr(host) -i laravel-api.company.com

    # AI services (to the GEX44 cluster)
    acl is_ai_api hdr(host) -i ai-api.company.com

    # Routing
    use_backend web_app_backend if is_web_app
    use_backend python_api_backend if is_api_python
    use_backend laravel_api_backend if is_api_laravel
    use_backend gex44_cluster if is_ai_api

backend web_app_backend
    balance roundrobin
    server web1 k8s-node1:30080 check
    server web2 k8s-node2:30080 check

backend python_api_backend
    balance roundrobin
    server api1 k8s-node1:30081 check
    server api2 k8s-node2:30081 check

backend gex44_cluster
    balance roundrobin
    server gex44-1 10.0.1.101:8000 check
    server gex44-2 10.0.1.102:8000 check
    server gex44-3 10.0.1.103:8000 check
```
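
A quick smoke test for this routing, assuming the load balancer's public IP is known (`LB_IP` below is a placeholder): each Host header should reach its backend.

```bash
LB_IP=203.0.113.10  # placeholder: the load balancer's public IP
for h in app.company.com api.company.com ai-api.company.com; do
  curl -s -o /dev/null -w "$h -> %{http_code}\n" -H "Host: $h" "http://$LB_IP/"
done
```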

## Deployment Scripts

### Per-Team Deployment
```bash
#!/bin/bash
# deployment/scripts/deploy-team.sh

TEAM=$1
ENVIRONMENT=$2

if [ -z "$TEAM" ] || [ -z "$ENVIRONMENT" ]; then
  echo "Usage: ./deploy-team.sh <team-name> <environment>"
  exit 1
fi

echo "🚀 Deploying $TEAM applications to $ENVIRONMENT"

# Build and push every application owned by the team
for app in applications/$TEAM/*/; do
  if [ -f "$app/Dockerfile" ]; then
    echo "📦 Building $(basename "$app")..."
    cd "$app"
    docker build -t "registry.company.com/$TEAM/$(basename "$app"):latest" .
    docker push "registry.company.com/$TEAM/$(basename "$app"):latest"
    cd - > /dev/null
  fi
done

# Deploy to Kubernetes
kubectl apply -f "deployment/environments/$ENVIRONMENT/"
# kubectl set image needs explicit container=image pairs (wildcards are not
# supported), so update each team deployment individually
for app in applications/$TEAM/*/; do
  name=$(basename "$app")
  kubectl set image "deployment/$name" "$name=registry.company.com/$TEAM/$name:latest"
done

echo "✅ Deployment completed for team $TEAM"
```
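
For example, rebuilding and rolling out every backend-team service to staging:

```bash
./deployment/scripts/deploy-team.sh team-backend staging
```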

### Example React Application
```dockerfile
# applications/team-frontend/web-app-react/Dockerfile
FROM node:18-alpine AS builder

WORKDIR /app
COPY package*.json ./
# Install all dependencies: the build step needs devDependencies
# (bundler, toolchain), so --only=production would break `npm run build`.
RUN npm ci

COPY . .
RUN npm run build

FROM nginx:alpine
COPY --from=builder /app/dist /usr/share/nginx/html
COPY nginx.conf /etc/nginx/nginx.conf

EXPOSE 80
CMD ["nginx", "-g", "daemon off;"]
```

```javascript
// applications/team-frontend/web-app-react/src/services/aiApi.js
class AIApiService {
  constructor() {
    this.baseUrl = process.env.REACT_APP_AI_API_URL || 'http://localhost:8000';
  }

  async generateText(prompt, model = 'mixtral-8x7b') {
    const response = await fetch(`${this.baseUrl}/v1/chat/completions`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
      },
      body: JSON.stringify({
        model: model,
        messages: [{ role: 'user', content: prompt }],
        max_tokens: 1000,
        temperature: 0.7
      })
    });

    return response.json();
  }
}

export default new AIApiService();
```

## Advantages of This Organization

### Separation of Responsibilities
- **Team DevOps**: GPU infrastructure and overall orchestration
- **Team Frontend**: web and mobile applications
- **Team Backend**: APIs and microservices
- **Team AI**: models and custom inference services

### Independent Deployment
- Each team can deploy its applications independently
- One CI/CD pipeline per application
- Granular rollbacks are possible

### Differentiated Scaling
- **GPU infrastructure**: scales with the AI load (expensive)
- **Web applications**: scale with web traffic (cheaper)
- Resources are optimized per workload type

### Tailored Monitoring
- GPU metrics for the AI services
- Standard web metrics for the applications
- Per-team dashboards in Grafana

This approach keeps the specialized GPU infrastructure intact while efficiently supporting a diverse ecosystem of applications.
406
docs/ARCHITECTURE.md
Normal file
@ -0,0 +1,406 @@
# Infrastructure Architecture

## Overview

This document describes the architecture of the AI Infrastructure running on Hetzner Cloud and dedicated servers. The system is designed for high-performance AI inference with cost optimization, automatic scaling, and production-grade reliability.

## High-Level Architecture

```
┌─────────────────────────────────────────────────────────────────┐
│                            Internet                             │
└─────────────────────┬───────────────────────────────────────────┘
                      │
              ┌───────▼───────┐
              │  CloudFlare   │  (Optional CDN/DDoS protection)
              │    Proxy      │
              └───────┬───────┘
                      │
┌─────────────────────▼───────────────────────────────────────────┐
│                        Hetzner Cloud                            │
│  ┌─────────────────┐  ┌─────────────────┐  ┌──────────────┐     │
│  │   HAProxy LB    │  │  API Gateway    │  │  Monitoring  │     │
│  │     (cx31)      │  │     (cx31)      │  │    (cx21)    │     │
│  │   8CPU/32GB     │  │   8CPU/32GB     │  │  4CPU/16GB   │     │
│  │  €22.68/month   │  │  €22.68/month   │  │  €11.76/mo   │     │
│  └────────┬────────┘  └────────┬────────┘  └───────┬──────┘     │
└───────────┼────────────────────┼───────────────────┼────────────┘
            │                    │                   │
      ┌─────▼─────┐        ┌─────▼───┐         ┌─────▼─────┐
      │           │        │         │         │           │
      │   GEX44   │        │  GEX44  │         │   GEX44   │
      │    #1     │        │   #2    │         │    #3     │
      │           │        │         │         │           │
      │ vLLM API  │        │vLLM API │         │ vLLM API  │
      │Mixtral-8x7│        │Llama-70B│         │CodeLlama  │
      │€184/month │        │€184/mo  │         │€184/month │
      └─────┬─────┘        └────┬────┘         └─────┬─────┘
            │                   │                    │
      ┌─────▼───────────────────▼────────────────────▼─────┐
      │              Hetzner Private Network               │
      │            (10.0.0.0/16 - VXLAN overlay)           │
      └────────────────────────────────────────────────────┘
```

## Component Details

### 1. Load Balancer (HAProxy)

**Hardware**: Hetzner Cloud cx31 (8 vCPU, 32GB RAM)
**Location**: Private IP 10.0.2.10
**Role**: Traffic distribution, SSL termination, health checks

**Features**:
- Round-robin load balancing with health checks
- SSL/TLS termination with automatic certificate renewal
- Statistics dashboard (port 8404)
- Request routing based on URL patterns
- Rate limiting and DDoS protection
- Prometheus metrics export

**Configuration**:
```haproxy
backend vllm_backend
    balance roundrobin
    option httpchk GET /health
    server gex44-1 10.0.1.10:8000 check
    server gex44-2 10.0.1.11:8000 check
    server gex44-3 10.0.1.12:8000 check
```
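
To verify the pool from the load balancer itself, the admin socket configured in the group vars can be queried directly; a minimal sketch, assuming `socat` is installed:

```bash
# Show proxy name, server name, and status for every entry in the stats CSV
# (field 18 of HAProxy's "show stat" output is the status column).
echo "show stat" | socat stdio /run/haproxy/admin.sock \
  | cut -d',' -f1,2,18 | column -s',' -t
```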

### 2. API Gateway (Nginx)

**Hardware**: Hetzner Cloud cx31 (8 vCPU, 32GB RAM)
**Location**: Private IP 10.0.2.11
**Role**: API management, authentication, rate limiting

**Features**:
- Request/response transformation
- API versioning and routing
- Authentication and authorization
- Request/response logging
- API analytics and metrics
- Caching for frequently requested data

### 3. GPU Servers (GEX44)

**Hardware per server**:
- CPU: Intel i5-13500 (12 cores, 20 threads)
- GPU: NVIDIA RTX 4000 Ada Generation (20GB VRAM)
- RAM: 64GB DDR4
- Storage: 2x 1.92TB NVMe SSD (RAID 1)
- Network: 1 Gbit/s

**Software Stack**:
- OS: Ubuntu 22.04 LTS
- CUDA: 12.3
- Python: 3.11
- vLLM: 0.3.0+
- Docker: 24.0.5

**Network Configuration**:
- Private IPs: 10.0.1.10, 10.0.1.11, 10.0.1.12
- vLLM API: Port 8000
- Metrics: Port 9835 (nvidia-smi-exporter)
- Node metrics: Port 9100 (node-exporter)

### 4. Monitoring Stack

**Hardware**: Hetzner Cloud cx21 (4 vCPU, 16GB RAM)
**Location**: Private IP 10.0.2.12

**Components**:
- **Prometheus**: Metrics collection and storage
- **Grafana**: Visualization and dashboards
- **AlertManager**: Alert routing and notification
- **Node Exporter**: System metrics
- **nvidia-smi-exporter**: GPU metrics

## Network Architecture

### Private Network

**CIDR**: 10.0.0.0/16
**Subnets**:
- Cloud servers: 10.0.2.0/24
- GEX44 servers: 10.0.1.0/24

### Security Groups

1. **SSH Access**: Port 22 (restricted IPs)
2. **HTTP/HTTPS**: Ports 80, 443 (public)
3. **API Access**: Port 8000 (internal only)
4. **Monitoring**: Ports 3000, 9090 (restricted)
5. **Internal Communication**: All ports within private network

### Firewall Rules

```yaml
# Public access
- HTTP (80) from 0.0.0.0/0
- HTTPS (443) from 0.0.0.0/0

# Management access (restrict to office IPs)
- SSH (22) from office_cidr
- Grafana (3000) from office_cidr
- Prometheus (9090) from office_cidr

# Internal communication
- All traffic within 10.0.0.0/16
```
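
As a sketch, the same policy expressed with ufw on a single host (the repo applies these rules via Ansible and Hetzner Cloud firewalls; `OFFICE_CIDR` is a placeholder):

```bash
OFFICE_CIDR=203.0.113.0/24  # placeholder for the office IP range

ufw allow 80/tcp                                          # public HTTP
ufw allow 443/tcp                                         # public HTTPS
ufw allow from "$OFFICE_CIDR" to any port 22 proto tcp    # SSH, office only
ufw allow from "$OFFICE_CIDR" to any port 3000,9090 proto tcp  # Grafana/Prometheus
ufw allow from 10.0.0.0/16                                # private network traffic
```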

## Data Flow

### Inference Request Flow

1. **Client** → **Load Balancer** (HAProxy)
   - SSL termination
   - Request routing
   - Health check validation

2. **Load Balancer** → **GPU Server** (vLLM)
   - HTTP request to /v1/chat/completions
   - Model selection and processing
   - Response generation

3. **GPU Server** → **Load Balancer** → **Client**
   - JSON response with completion
   - Usage metrics included
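
End to end, a request looks like the sketch below (the public domain is illustrative; the path and payload follow vLLM's OpenAI-compatible API):

```bash
curl -s https://ai-api.company.com/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "mixtral-8x7b",
        "messages": [{"role": "user", "content": "Hello!"}],
        "max_tokens": 50
      }'
```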

### Monitoring Data Flow

1. **GPU Servers** → **Prometheus**
   - nvidia-smi metrics (GPU utilization, temperature, memory)
   - vLLM metrics (requests, latency, tokens)
   - System metrics (CPU, memory, disk)

2. **Load Balancer** → **Prometheus**
   - HAProxy metrics (requests, response times, errors)
   - Backend server health status

3. **Prometheus** → **Grafana**
   - Time-series data visualization
   - Dashboard rendering
   - Alert evaluation
|
||||
|
||||
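The same data can also be pulled straight from the Prometheus HTTP API. A sketch — the metric name `nvidia_smi_utilization_gpu_ratio` is an assumption and depends on the exporter build:

```bash
# Instant query against Prometheus on the monitoring host (10.0.2.12:9090).
# The metric name is exporter-dependent; list candidates with
# /api/v1/label/__name__/values before relying on it.
curl -s 'http://10.0.2.12:9090/api/v1/query' \
  --data-urlencode 'query=avg(nvidia_smi_utilization_gpu_ratio)' | jq .
```
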
## Storage Architecture

### Model Storage

**Location**: Each GEX44 server
**Path**: `/opt/vllm/models/`
**Size**: ~100GB per model

**Models Stored**:
- Mixtral-8x7B-Instruct (87GB)
- Llama-2-70B-Chat (140GB, quantized)
- CodeLlama-34B (68GB)

### Shared Storage

**Type**: Hetzner Cloud Volume
**Size**: 500GB
**Mount**: `/mnt/shared`
**Purpose**: Configuration, logs, backups

### Backup Strategy

**What is backed up**:
- Terraform state files
- Ansible configurations
- Grafana dashboards
- Prometheus configuration
- Application logs (last 7 days)

**What is NOT backed up**:
- Model files (re-downloadable)
- Prometheus metrics (30-day retention)
- Large log files (rotated automatically)

## Scaling Architecture

### Horizontal Scaling

**Auto-scaling triggers**:
- GPU utilization > 80% for 10 minutes → Scale up
- GPU utilization < 30% for 30 minutes → Scale down
- Queue depth > 50 requests → Immediate scale up

**Scaling process** (sketched below):
1. Monitor metrics via Prometheus
2. Autoscaler service evaluates conditions
3. Order new GEX44 via Robot API
4. Ansible configures new server
5. Add to load balancer pool

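A minimal sketch of the evaluation step — the recording rule name `gpu_utilization_avg` is an assumption, while `scripts/autoscaler.py` and its flags appear in the troubleshooting guide:

```bash
#!/usr/bin/env bash
# Hypothetical threshold check; the real logic lives in scripts/autoscaler.py.
PROM="http://10.0.2.12:9090"
SCALE_UP_THRESHOLD=0.8
SCALE_DOWN_THRESHOLD=0.3

# assumption: a recording rule exposes mean GPU utilization as 0.0-1.0
util=$(curl -s "$PROM/api/v1/query" \
  --data-urlencode 'query=avg_over_time(gpu_utilization_avg[10m])' \
  | jq -r '.data.result[0].value[1]')

if (( $(echo "$util > $SCALE_UP_THRESHOLD" | bc -l) )); then
  python scripts/autoscaler.py --action=scale-up --count=1
elif (( $(echo "$util < $SCALE_DOWN_THRESHOLD" | bc -l) )); then
  python scripts/autoscaler.py --action=scale-down --count=1
fi
```
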
### Vertical Scaling

**Model optimization**:
- Quantization (AWQ, GPTQ)
- Tensor parallelism for large models
- Memory optimization techniques

## High Availability

### Redundancy

- **Load Balancer**: Single point (acceptable for cost/benefit)
- **GPU Servers**: 3 servers minimum (N+1 redundancy)
- **Monitoring**: Single instance with backup configuration

### Failure Scenarios

1. **Single GPU server failure**:
   - Automatic removal from load balancer (a manual drain is sketched after this list)
   - 66% capacity maintained
   - Automatic replacement order

2. **Load balancer failure**:
   - Manual failover to backup
   - DNS change required
   - ~10 minute downtime

3. **Network partition**:
   - Private network redundancy
   - Automatic retry logic
   - Graceful degradation

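For the manual side of these scenarios, a backend can be drained through the HAProxy runtime API — this sketch assumes an admin-level stats socket at `/var/run/haproxy.sock` and a backend named `vllm_backend`, neither of which is confirmed by the configuration shown here:

```bash
# Take a failed GPU server out of rotation without reloading HAProxy.
echo "disable server vllm_backend/gex44-3" | socat stdio /var/run/haproxy.sock

# Re-enable it once repaired or replaced.
echo "enable server vllm_backend/gex44-3" | socat stdio /var/run/haproxy.sock
```
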
## Security Architecture

### Network Security

- Private network isolation
- Firewall rules at multiple levels
- No direct internet access to GPU servers
- VPN for administrative access

### Application Security

- API rate limiting
- Request validation
- Input sanitization
- Output filtering

### Infrastructure Security

- SSH key-based authentication
- Regular security updates
- Intrusion detection
- Log monitoring

## Performance Characteristics

### Latency

- **P50**: <1.5 seconds
- **P95**: <3 seconds
- **P99**: <5 seconds

### Throughput

- **Total**: ~255 tokens/second (3 servers)
- **Per server**: ~85 tokens/second
- **Max RPS**: ~50 requests/second

### Resource Utilization

- **GPU**: 65-75% average utilization
- **CPU**: 30-40% average utilization
- **Memory**: 70-80% utilization (model loading)
- **Network**: <100 Mbps typical

## Cost Breakdown

### Monthly Costs (EUR)

| Component | Quantity | Unit Cost | Total |
|-----------|----------|-----------|--------|
| GEX44 Servers | 3 | €184 | €552 |
| cx31 (LB) | 1 | €22.68 | €22.68 |
| cx31 (API GW) | 1 | €22.68 | €22.68 |
| cx21 (Monitor) | 1 | €11.76 | €11.76 |
| Storage | 500GB | €0.05/GB | €25 |
| **Total** | | | **€634.12** |

### Cost per Request

At 100,000 requests/day:
- Monthly requests: 3,000,000
- Cost per request: ~€0.0002 (€634.12 / 3,000,000 ≈ €0.00021)
- Cost per token: €0.0000025

## Disaster Recovery

### Backup Procedures

1. **Daily**: Configuration backup to cloud storage (sketched below)
2. **Weekly**: Full system state backup
3. **Monthly**: Disaster recovery test

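As a sketch, the daily job could be as small as a cron entry that archives the key configuration to the shared volume — the exact paths and retention are assumptions:

```bash
# Hypothetical /etc/cron.daily/config-backup
STAMP=$(date +%Y%m%d)
tar czf "/mnt/shared/backups/config-${STAMP}.tar.gz" \
  terraform/ ansible/ monitoring/grafana/dashboards/ monitoring/prometheus/
# Keep one week of archives, matching the 7-day application log policy.
find /mnt/shared/backups/ -name 'config-*.tar.gz' -mtime +7 -delete
```
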
### Recovery Procedures

1. **Infrastructure**: Terraform state restoration
2. **Configuration**: Ansible playbook execution
3. **Models**: Re-download from HuggingFace
4. **Data**: Restore from backup storage

### RTO/RPO Targets

- **RTO** (Recovery Time Objective): 2 hours
- **RPO** (Recovery Point Objective): 24 hours

## Monitoring and Alerting

### Key Metrics

**Infrastructure**:
- GPU utilization and temperature
- Memory usage and availability
- Network throughput
- Storage usage

**Application**:
- Request rate and latency
- Error rate and types
- Token generation rate
- Queue depth

**Business**:
- Cost per request
- Revenue per request
- SLA compliance
- User satisfaction

### Alert Levels

1. **Info**: Cost optimization opportunities
2. **Warning**: Performance degradation
3. **Critical**: Service outage or severe issues

## Future Architecture Considerations

### Planned Improvements

1. **Multi-region deployment** (Q4 2024)
   - Nuremberg + Helsinki regions
   - Cross-region load balancing
   - Improved latency for global users

2. **Advanced auto-scaling** (Q1 2025)
   - Predictive scaling based on usage patterns
   - Spot instance integration
   - More sophisticated cost optimization

3. **Edge deployment** (Q2 2025)
   - Smaller models at edge locations
   - Reduced latency for simple requests
   - Hybrid edge-cloud architecture

### Technology Evolution

- **Hardware**: Migration to H100 when cost-effective
- **Software**: Continuous optimization of the inference stack
- **Networking**: 10 Gbit/s upgrade for high-throughput scenarios

This architecture provides a solid foundation for scaling from thousands to millions of requests per day while maintaining cost efficiency and performance.

568
docs/DEPLOYMENT.md
Normal file
@ -0,0 +1,568 @@
# Deployment Guide

This guide provides step-by-step instructions for deploying the AI Infrastructure on Hetzner Cloud and dedicated servers.

## Prerequisites

Before starting the deployment, ensure you have:

### Required Accounts and Access

1. **Hetzner Cloud Account**
   - API token with read/write permissions
   - Budget sufficient for cloud resources (~€60/month)

2. **Hetzner Robot Account**
   - API credentials for dedicated server management
   - Budget for GEX44 servers (€184/month each)

3. **GitLab Account** (for CI/CD)
   - Project with CI/CD pipelines enabled
   - Variables configured for secrets

### Local Development Environment

```bash
# Required tools
terraform >= 1.5.0
ansible >= 8.0.0
kubectl >= 1.28.0  # Optional
docker >= 24.0.0
python >= 3.11
go >= 1.21  # For testing

# Install tools on Ubuntu/Debian
sudo apt update
sudo apt install -y software-properties-common
curl -fsSL https://apt.releases.hashicorp.com/gpg | sudo apt-key add -
sudo apt-add-repository "deb [arch=amd64] https://apt.releases.hashicorp.com $(lsb_release -cs) main"
sudo apt update
sudo apt install terraform ansible python3-pip docker.io

# Install additional tools
pip3 install ansible-lint molecule[docker]
```

### SSH Key Setup

```bash
# Generate SSH key for server access
ssh-keygen -t rsa -b 4096 -f ~/.ssh/hetzner_key -C "ai-infrastructure"

# Add to SSH agent
ssh-add ~/.ssh/hetzner_key

# Copy public key content
cat ~/.ssh/hetzner_key.pub
```

## Pre-Deployment Checklist

### 1. Order GEX44 Servers

**Important**: GEX44 servers must be ordered manually through the Hetzner Robot portal or API.

```bash
# Order via the Robot API (optional)
curl -X POST https://robot-ws.your-server.de/order/server \
  -H "Authorization: Basic $(echo -n 'username:password' | base64)" \
  -d "product_id=GEX44&location=FSN1-DC14&os=ubuntu-22.04"
```

**Manual ordering steps**:
1. Log in to the [Robot Console](https://robot.your-server.de/)
2. Navigate to "Order" → "Dedicated Servers"
3. Select the GEX44 configuration:
   - Location: FSN1-DC14 (Falkenstein)
   - OS: Ubuntu 22.04 LTS
   - Quantity: 3 (for production)
4. Complete payment and wait for provisioning (2-24 hours); progress can be polled via the Robot API, as sketched below

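Provisioning can also be watched from the CLI — a sketch against the Robot webservice's server listing (field names in the response may differ):

```bash
# List dedicated servers and their current status via the Robot API.
curl -s -u "$ROBOT_API_USER:$ROBOT_API_PASSWORD" \
  https://robot-ws.your-server.de/server \
  | jq '.[].server | {server_ip, product, status}'
```
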
### 2. Configure Environment Variables

Create the environment file:

```bash
# Copy the example environment file
cp .env.example .env

# Edit it with your credentials
vim .env
```

Required variables:

```bash
# Hetzner credentials
HCLOUD_TOKEN=your_hcloud_token_here
ROBOT_API_USER=your_robot_username
ROBOT_API_PASSWORD=your_robot_password

# SSH configuration
SSH_PUBLIC_KEY="ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQ..."
SSH_PRIVATE_KEY_PATH=~/.ssh/hetzner_key

# Domain configuration (optional)
API_DOMAIN=api.yourdomain.com
MONITORING_DOMAIN=monitoring.yourdomain.com

# Monitoring credentials
GRAFANA_ADMIN_PASSWORD=secure_password_here

# GitLab CI/CD
GITLAB_TOKEN=your_gitlab_token
ANSIBLE_VAULT_PASSWORD=secure_vault_password

# Cost tracking
PROJECT_NAME=ai-infrastructure
COST_CENTER=engineering

# Auto-scaling configuration
MIN_GEX44_COUNT=1
MAX_GEX44_COUNT=5
SCALE_UP_THRESHOLD=0.8
SCALE_DOWN_THRESHOLD=0.3
```

### 3. Configure Terraform Backend

Choose your state backend:

#### Option A: GitLab Backend (Recommended)

```hcl
# terraform/backend.tf
terraform {
  backend "http" {
    address        = "https://gitlab.com/api/v4/projects/YOUR_PROJECT_ID/terraform/state/ai-infrastructure"
    lock_address   = "https://gitlab.com/api/v4/projects/YOUR_PROJECT_ID/terraform/state/ai-infrastructure/lock"
    unlock_address = "https://gitlab.com/api/v4/projects/YOUR_PROJECT_ID/terraform/state/ai-infrastructure/lock"
    username       = "your-username"
    password       = "your-access-token"
    lock_method    = "POST"
    unlock_method  = "DELETE"
    retry_wait_min = 5
  }
}
```

#### Option B: S3-Compatible Backend

```hcl
# terraform/backend.tf
terraform {
  backend "s3" {
    bucket                  = "your-terraform-state-bucket"
    key                     = "ai-infrastructure/terraform.tfstate"
    region                  = "eu-central-1"
    encrypt                 = true
    dynamodb_table          = "terraform-state-lock"
    shared_credentials_file = "~/.aws/credentials"
    profile                 = "default"
  }
}
```

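Rather than committing credentials in `backend.tf`, they can be supplied at init time through Terraform's partial backend configuration — a sketch for the GitLab option:

```bash
# Keep the token out of version control; Terraform merges these values
# into the backend block at init time.
terraform init \
  -backend-config="username=your-username" \
  -backend-config="password=$GITLAB_TOKEN"
```
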
## Deployment Process

### Step 1: Initial Setup

```bash
# Clone the repository
git clone https://github.com/yourorg/ai-infrastructure.git
cd ai-infrastructure

# Install dependencies
make setup

# Validate configuration
make validate
```

### Step 2: Development Environment

Start with a development deployment to test the configuration:

```bash
# Deploy development environment
make deploy-dev

# Wait for completion (15-20 minutes)
# Check deployment status
make status ENV=dev

# Test the deployment
make test ENV=dev
```

### Step 3: Staging Environment

Once development is working, deploy staging:

```bash
# Plan staging deployment
make plan ENV=staging

# Review the plan carefully
# Deploy staging
make deploy-staging

# Run integration tests
make test-load API_URL=https://api-staging.yourdomain.com
```

### Step 4: Production Deployment

**Warning**: Production deployment should be done during maintenance windows.

```bash
# Create a backup of the current state
make backup ENV=production

# Plan production deployment
make plan ENV=production

# Review plan with team
# Get approval for production deployment

# Deploy production (requires manual confirmation)
make deploy-prod

# Verify deployment
make status ENV=production
make test ENV=production
```

## Detailed Deployment Steps

### Infrastructure Deployment (Terraform)

```bash
# Navigate to the terraform directory
cd terraform/environments/production

# Initialize Terraform
terraform init

# Create execution plan
terraform plan -out=production.tfplan

# Review the plan
terraform show production.tfplan

# Apply the plan
terraform apply production.tfplan
```

Expected resources to be created (verify with the sketch below):
- 1x Private network (10.0.0.0/16)
- 2x Subnets (cloud and GEX44)
- 4x Firewall rules
- 3x Cloud servers (LB, API GW, Monitoring)
- 1x Volume (500GB)
- Various security groups

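One way to confirm the apply produced what was expected — a sketch assuming the Hetzner provider's usual resource names:

```bash
# List everything Terraform now manages and spot-check the counts.
terraform state list
terraform state list | grep -c 'hcloud_server\.'   # expect 3 cloud servers
terraform output                                   # IPs, network IDs, etc.
```
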
### Server Configuration (Ansible)

```bash
# Navigate to the ansible directory
cd ansible

# Test connectivity
ansible all -i inventory/production.yml -m ping

# Run full configuration
ansible-playbook -i inventory/production.yml playbooks/site.yml

# Verify services are running
ansible all -i inventory/production.yml -a "systemctl status vllm-api"
```

### GEX44 Configuration

The GEX44 servers require special handling due to their dedicated nature:

```bash
# Configure GEX44 servers specifically
ansible-playbook -i inventory/production.yml playbooks/gex44-setup.yml

# Wait for model downloads (can take 1-2 hours)
# Check progress (tail -f would hang an ad hoc command, so use -n)
ansible gex44 -i inventory/production.yml -a "tail -n 50 /var/log/vllm/model-download.log"

# Verify GPU accessibility
ansible gex44 -i inventory/production.yml -a "nvidia-smi"

# Test vLLM API
ansible gex44 -i inventory/production.yml -a "curl -f http://localhost:8000/health"
```

### Load Balancer Configuration

```bash
# Configure the HAProxy load balancer
ansible-playbook -i inventory/production.yml playbooks/load-balancer-setup.yml

# Test the load balancer
curl -f http://LOAD_BALANCER_IP/health

# Check HAProxy stats
curl http://LOAD_BALANCER_IP:8404/stats
```

### Monitoring Setup

```bash
# Configure the monitoring stack
ansible-playbook -i inventory/production.yml playbooks/monitoring-setup.yml

# Access Grafana (after DNS setup)
open https://monitoring.yourdomain.com

# Default credentials:
# Username: admin
# Password: (from GRAFANA_ADMIN_PASSWORD)
```

## Post-Deployment Configuration

### 1. DNS Configuration

Update your DNS records to point to the deployed infrastructure:

```dns
api.yourdomain.com.         300 IN A LOAD_BALANCER_IP
monitoring.yourdomain.com.  300 IN A MONITORING_IP
*.api.yourdomain.com.       300 IN A LOAD_BALANCER_IP
```

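Once the records propagate, resolution can be spot-checked before moving on:

```bash
# Both names should return the addresses configured above.
dig +short api.yourdomain.com
dig +short monitoring.yourdomain.com
```
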
### 2. SSL Certificate Setup

```bash
# Let's Encrypt certificates (automatic)
ansible-playbook -i inventory/production.yml playbooks/ssl-setup.yml

# Or manually with certbot
sudo certbot --nginx -d api.yourdomain.com -d monitoring.yourdomain.com
```

### 3. Monitoring Configuration

#### Grafana Dashboards

1. Log in to Grafana at https://monitoring.yourdomain.com
2. Import the pre-built dashboards from `monitoring/grafana/dashboards/` (or push them via the API, sketched below)
3. Configure alert channels (email, Slack, etc.)

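A sketch of the API route, assuming the files in `monitoring/grafana/dashboards/` are plain dashboard JSON models:

```bash
# Import every dashboard JSON via the Grafana HTTP API.
for f in monitoring/grafana/dashboards/*.json; do
  jq -n --slurpfile d "$f" '{dashboard: $d[0], overwrite: true}' \
    | curl -s -u "admin:$GRAFANA_ADMIN_PASSWORD" \
        -H "Content-Type: application/json" \
        -X POST https://monitoring.yourdomain.com/api/dashboards/db \
        -d @-
done
```
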
#### Prometheus Alerts

Alerts are configured automatically, but you may want to customize them:

```bash
# Edit alert rules
vim monitoring/prometheus/alerts.yml

# Reload the Prometheus configuration
ansible monitoring -i inventory/production.yml -a "systemctl reload prometheus"
```

### 4. Backup Configuration

```bash
# Set up automated backups
ansible-playbook -i inventory/production.yml playbooks/backup-setup.yml

# Test the backup process
make backup ENV=production

# Verify the backup files
ls -la backups/$(date +%Y%m%d)/
```

## Validation and Testing

### Health Checks

```bash
# Infrastructure health
make status ENV=production

# API health
curl -f https://api.yourdomain.com/health

# Monitoring health
curl -f https://monitoring.yourdomain.com/api/health
```

### Load Testing

```bash
# Basic load test
make test-load API_URL=https://api.yourdomain.com

# Extended load test
k6 run tests/load/k6_inference_test.js --env API_URL=https://api.yourdomain.com
```

### Contract Testing

```bash
# API contract tests
python tests/contracts/test_inference_api.py --api-url=https://api.yourdomain.com
```

## Troubleshooting Deployment Issues

### Common Issues

#### 1. Terraform State Lock

```bash
# If the state is locked
terraform force-unlock LOCK_ID

# Or repair the state manually (dangerous)
terraform state pull > backup.tfstate
terraform state rm <resource_address>      # remove the problematic resource
terraform import <resource_address> <id>   # re-import it
```

#### 2. Ansible Connection Issues

```bash
# Test SSH connectivity
ansible all -i inventory/production.yml -m ping

# Check the SSH agent
ssh-add -l

# Debug the connection
ansible all -i inventory/production.yml -m ping -vvv
```

#### 3. GEX44 Not Accessible

```bash
# Check server status in the Robot console
# Verify the network configuration
# Ensure the servers are in the same private network

# Manual SSH to debug
ssh -i ~/.ssh/hetzner_key ubuntu@GEX44_IP
```

#### 4. Model Download Failures

```bash
# Check disk space
ansible gex44 -i inventory/production.yml -a "df -h"

# Check the download logs
ansible gex44 -i inventory/production.yml -a "tail -n 50 /var/log/vllm/model-download.log"

# Retry the download
ansible-playbook -i inventory/production.yml playbooks/gex44-setup.yml --tags=models
```

### Debug Commands

```bash
# Check all service statuses
ansible all -i inventory/production.yml -a "systemctl list-units --failed"

# View logs
ansible all -i inventory/production.yml -a "journalctl -u vllm-api -n 50"

# Check GPU status
ansible gex44 -i inventory/production.yml -a "nvidia-smi"

# Check network connectivity
ansible all -i inventory/production.yml -a "ping -c 3 8.8.8.8"
```

## Rollback Procedures

### Emergency Rollback

```bash
# Stop accepting new traffic
# Put the load balancer into maintenance mode
ansible load_balancers -i inventory/production.yml -a "systemctl stop haproxy"

# Roll back Terraform changes
cd terraform/environments/production
terraform plan -destroy -out=rollback.tfplan
terraform apply rollback.tfplan

# Restore from backup
make restore BACKUP_DATE=20241201 ENV=production
```

### Gradual Rollback

```bash
# Remove problematic servers from the load balancer
# Update the HAProxy configuration to exclude failed servers
ansible-playbook -i inventory/production.yml playbooks/load-balancer-setup.yml --extra-vars="exclude_servers=['gex44-3']"

# Fix the issues on the excluded servers
# Re-add them to the load balancer when ready
```

## Maintenance Procedures

### Regular Maintenance

```bash
# Weekly: update all packages (shell module needed for '&&')
ansible all -i inventory/production.yml -m shell -a "apt update && apt upgrade -y" --become

# Monthly: restart services
ansible all -i inventory/production.yml -a "systemctl restart vllm-api"

# Quarterly: full system reboot (during a maintenance window)
ansible all -i inventory/production.yml -a "reboot" --become
```

### Cost Optimization

```bash
# Generate a cost report
make cost-report ENV=production

# Review unused resources
python scripts/cost-analysis.py --find-unused

# Implement the recommendations
# Scale down during low-usage periods
```

## Security Hardening

### Post-Deployment Security

```bash
# Run the security hardening playbook
ansible-playbook -i inventory/production.yml playbooks/security-hardening.yml

# Update firewall rules
ansible-playbook -i inventory/production.yml playbooks/firewall-setup.yml

# Rotate SSH keys
ansible-playbook -i inventory/production.yml playbooks/ssh-key-rotation.yml
```

### Security Monitoring

```bash
# Enable fail2ban
ansible all -i inventory/production.yml -a "systemctl enable fail2ban"

# Set up log monitoring
ansible-playbook -i inventory/production.yml playbooks/log-monitoring.yml

# Configure intrusion detection
ansible-playbook -i inventory/production.yml playbooks/ids-setup.yml
```

This deployment guide provides a comprehensive path from initial setup to production deployment. Always test changes in the development and staging environments before applying them to production.

103
docs/README.md
Normal file
@ -0,0 +1,103 @@
# AI Infrastructure Documentation

## Overview

Complete documentation for the Hetzner GEX44-based AI infrastructure: multi-environment deployment with Terraform, Ansible, and GitLab CI/CD.

## Architecture

- **Environments**: Development, Staging, Production
- **Platform**: Hetzner Cloud + GEX44 dedicated servers
- **OS**: Ubuntu 24.04 LTS on all servers
- **GPU**: NVIDIA RTX 4000 Ada Generation (20GB VRAM)
- **Container Runtime**: Docker 24.0.x
- **Orchestration**: Terraform + Ansible
- **CI/CD**: GitLab Pipeline

## Quick Links

- [🔧 Tools & Technologies](./tools.md) - Complete list of the tools used
- [🏗️ Infrastructure](./infrastructure.md) - Detailed architecture
- [🚀 Deployment](./deployment.md) - Deployment guide
- [📊 Monitoring](./monitoring.md) - Monitoring and observability
- [🔒 Security](./security.md) - Security configuration
- [💰 Costs](./costs.md) - Cost analysis

## Project Structure

```
.
├── inventories/                 # Per-environment configuration
│   ├── development/             # Dev environment
│   ├── staging/                 # Staging environment
│   ├── production/              # Production environment
│   └── generate_inventory.py    # Ansible inventory generator
├── terraform/                   # Infrastructure as Code
│   ├── environments/            # Per-environment configuration
│   └── modules/                 # Reusable modules
├── ansible/                     # Configuration management
│   ├── roles/                   # Ansible roles
│   ├── playbooks/               # Playbooks
│   └── group_vars/              # Per-environment variables
├── scripts/                     # Automation scripts
├── monitoring/                  # Monitoring configuration
└── docs/                        # Documentation
```

## Costs per Environment

| Environment | Servers | Cost/month | Description |
|-------------|---------|------------|-------------|
| **Development** | 1x CX31 (CPU-only) | €23 | GPU simulation, dev tests |
| **Staging** | 1x GEX44 + 2x Cloud | €206 | Full validation |
| **Production** | 3x GEX44 + 3x Cloud | €609 | High availability |
| **Total** | | **€838** | vs ~€15,840 for equivalent public cloud |

## Getting Started

### 1. Prerequisites

```bash
# Required tools
terraform >= 1.12
ansible >= 8.0
python >= 3.12
docker >= 24.0
```

### 2. Initial Configuration

```bash
# Clone the project
git clone <repository>
cd ai-infrastructure-hetzner

# Configure the environment variables
cp .env.example .env
# Edit .env with your Hetzner tokens

# Install the Python dependencies
pip install -r requirements.txt
```

### 3. Deployment

```bash
# Development deployment
cd terraform/environments/development
terraform init && terraform apply

# Generate the Ansible inventory
cd ../../../inventories
python3 generate_inventory.py development

# Configure with Ansible
cd ../ansible
ansible-playbook -i inventories/development/hosts.yml site.yml
```

## Support

- **Issues**: Use the project's ticket system
- **Documentation**: See the `docs/` folder
- **Monitoring**: Grafana is reachable via the per-environment URLs

659
docs/TROUBLESHOOTING.md
Normal file
@ -0,0 +1,659 @@
# Troubleshooting Guide

This guide helps diagnose and resolve common issues with the AI Infrastructure deployment.

## Quick Diagnostic Commands

```bash
# Overall system health
make status ENV=production

# Check all services
ansible all -i inventory/production.yml -a "systemctl list-units --failed"

# View recent logs
ansible all -i inventory/production.yml -a "journalctl --since '10 minutes ago' --no-pager"

# Check GPU status
ansible gex44 -i inventory/production.yml -a "nvidia-smi"

# Test API endpoints
curl -f https://api.yourdomain.com/health
curl -f https://api.yourdomain.com/v1/models
```

## Infrastructure Issues

### Server Not Responding

**Symptoms**: Server unreachable via SSH or API

**Diagnosis**:
```bash
# Check server status in the Hetzner Console
# Ping test
ping SERVER_IP

# SSH connectivity test
ssh -v -i ~/.ssh/hetzner_key ubuntu@SERVER_IP

# Check from other servers
ansible other_servers -i inventory/production.yml -a "ping -c 3 SERVER_IP"
```

**Solutions**:
1. **Network Issues**:
   ```bash
   # Restart networking
   ansible TARGET_SERVER -i inventory/production.yml -a "systemctl restart networking"

   # Check firewall status
   ansible TARGET_SERVER -i inventory/production.yml -a "ufw status"

   # Reset the firewall if needed
   ansible TARGET_SERVER -i inventory/production.yml -a "ufw --force reset"
   ```

2. **Server Overload**:
   ```bash
   # Check resource usage (shell module for the pipe)
   ansible TARGET_SERVER -i inventory/production.yml -m shell -a "top -bn1 | head -20"

   # Check disk space
   ansible TARGET_SERVER -i inventory/production.yml -a "df -h"

   # Check memory
   ansible TARGET_SERVER -i inventory/production.yml -a "free -h"
   ```

3. **Hardware Issues**:
   - Contact Hetzner support
   - Check the Robot console for hardware alerts
   - Consider server replacement

### Private Network Issues

**Symptoms**: Servers can't communicate over the private network

**Diagnosis**:
```bash
# Check the private network configuration
ansible all -i inventory/production.yml -a "ip route show"

# Test private network connectivity
ansible all -i inventory/production.yml -a "ping -c 3 10.0.2.10"

# Check network interfaces
ansible all -i inventory/production.yml -a "ip addr show"
```

**Solutions**:
```bash
# Restart network interfaces
ansible all -i inventory/production.yml -a "systemctl restart networking"

# Re-apply the network configuration
ansible-playbook -i inventory/production.yml playbooks/network-setup.yml

# Check Hetzner Cloud network status
terraform show | grep network
```

## GPU Issues

### GPU Not Detected

**Symptoms**: The `nvidia-smi` command fails or shows no GPUs

**Diagnosis**:
```bash
# Check GPU hardware detection (shell module for the pipes)
ansible gex44 -i inventory/production.yml -m shell -a "lspci | grep -i nvidia"

# Check NVIDIA driver status
ansible gex44 -i inventory/production.yml -a "nvidia-smi"

# Check the driver version
ansible gex44 -i inventory/production.yml -a "cat /proc/driver/nvidia/version"

# Check kernel modules
ansible gex44 -i inventory/production.yml -m shell -a "lsmod | grep nvidia"
```

**Solutions**:
1. **Driver Issues**:
   ```bash
   # Reinstall the NVIDIA drivers
   ansible-playbook -i inventory/production.yml playbooks/gex44-setup.yml --tags=cuda

   # Reboot after driver installation
   ansible gex44 -i inventory/production.yml -a "reboot" --become
   ```

2. **Hardware Issues**:
   ```bash
   # Check hardware detection
   ansible gex44 -i inventory/production.yml -a "lshw -C display"

   # Check BIOS settings (requires physical access)
   # Contact Hetzner support for hardware issues
   ```

### GPU Memory Issues

**Symptoms**: CUDA out-of-memory errors, poor performance

**Diagnosis**:
```bash
# Check GPU memory usage
ansible gex44 -i inventory/production.yml -a "nvidia-smi --query-gpu=memory.used,memory.total --format=csv"

# Check running processes on the GPU
ansible gex44 -i inventory/production.yml -a "nvidia-smi pmon"

# Check the vLLM memory configuration
ansible gex44 -i inventory/production.yml -a "grep MEMORY /etc/vllm/config.env"
```

**Solutions**:
1. **Reduce Memory Usage**:
   ```bash
   # Lower GPU memory utilization
   ansible gex44 -i inventory/production.yml -m lineinfile -a "path=/etc/vllm/config.env line='VLLM_GPU_MEMORY_UTILIZATION=0.8' regexp='^VLLM_GPU_MEMORY_UTILIZATION='"

   # Restart vLLM
   ansible gex44 -i inventory/production.yml -a "systemctl restart vllm-api"
   ```

2. **Clear GPU Memory**:
   ```bash
   # Kill all GPU processes
   ansible gex44 -i inventory/production.yml -a "pkill -f python"

   # Reset the GPU
   ansible gex44 -i inventory/production.yml -a "nvidia-smi --gpu-reset"
   ```

### GPU Temperature Issues

**Symptoms**: High GPU temperatures, thermal throttling

**Diagnosis**:
```bash
# Check current temperatures
ansible gex44 -i inventory/production.yml -a "nvidia-smi --query-gpu=temperature.gpu,temperature.memory --format=csv"

# Check the temperature history in Grafana
# Navigate to the GPU Metrics dashboard
```

**Solutions**:
1. **Immediate Cooling**:
   ```bash
   # Reduce the GPU workload
   # Scale down inference requests temporarily

   # Check the cooling system
   ansible gex44 -i inventory/production.yml -a "sensors"
   ```

2. **Long-term Solutions**:
   - Contact Hetzner for datacenter cooling issues
   - Reduce GPU utilization limits
   - Implement better load balancing

## vLLM Service Issues

### vLLM Service Won't Start

**Symptoms**: `systemctl status vllm-api` shows a failed state

**Diagnosis**:
```bash
# Check the service status
ansible gex44 -i inventory/production.yml -a "systemctl status vllm-api"

# Check the service logs
ansible gex44 -i inventory/production.yml -a "journalctl -u vllm-api -n 50"

# Check the vLLM configuration
ansible gex44 -i inventory/production.yml -a "cat /etc/vllm/config.env"

# Test a manual start
ansible gex44 -i inventory/production.yml -a "sudo -u vllm python -m vllm.entrypoints.openai.api_server --help"
```

**Solutions**:
1. **Configuration Issues**:
   ```bash
   # Validate the configuration
   ansible-playbook -i inventory/production.yml playbooks/gex44-setup.yml --tags=config --check

   # Regenerate the configuration
   ansible-playbook -i inventory/production.yml playbooks/gex44-setup.yml --tags=config
   ```

2. **Permission Issues**:
   ```bash
   # Fix file permissions
   ansible gex44 -i inventory/production.yml -a "chown -R vllm:vllm /opt/vllm" --become
   ansible gex44 -i inventory/production.yml -a "chmod 755 /opt/vllm" --become
   ```

3. **Dependency Issues**:
   ```bash
   # Reinstall vLLM
   ansible gex44 -i inventory/production.yml -a "pip install --force-reinstall vllm"
   ```

### Model Loading Issues

**Symptoms**: vLLM starts but models fail to load

**Diagnosis**:
```bash
# Check the model files
ansible gex44 -i inventory/production.yml -a "ls -la /opt/vllm/models/"

# Check disk space
ansible gex44 -i inventory/production.yml -a "df -h /opt/vllm/models/"

# Check the model loading logs
ansible gex44 -i inventory/production.yml -a "tail -n 50 /var/log/vllm/model-loading.log"

# Test model access
ansible gex44 -i inventory/production.yml -a "sudo -u vllm python -c \"from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('/opt/vllm/models/mixtral-8x7b')\""
```

**Solutions**:
1. **Missing Models**:
   ```bash
   # Re-download the models
   ansible-playbook -i inventory/production.yml playbooks/gex44-setup.yml --tags=models

   # Check HuggingFace connectivity
   ansible gex44 -i inventory/production.yml -a "curl -f https://huggingface.co"
   ```

2. **Corrupted Models**:
   ```bash
   # Remove the corrupted model
   ansible gex44 -i inventory/production.yml -a "rm -rf /opt/vllm/models/mixtral-8x7b"

   # Re-download
   ansible-playbook -i inventory/production.yml playbooks/gex44-setup.yml --tags=models
   ```

3. **Insufficient Resources**:
   ```bash
   # Use a smaller model or quantization
   # Update the configuration to use quantized models
   ansible gex44 -i inventory/production.yml -m lineinfile -a "path=/etc/vllm/config.env line='VLLM_QUANTIZATION=awq' regexp='^VLLM_QUANTIZATION='"
   ```

### High Latency Issues

**Symptoms**: API responses take too long

**Diagnosis**:
```bash
# Check current latency (a sample curl-format.txt is sketched below)
curl -w "@curl-format.txt" -o /dev/null -s https://api.yourdomain.com/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model":"mixtral-8x7b","messages":[{"role":"user","content":"Hello"}],"max_tokens":10}'

# Check the queue size
curl -s https://api.yourdomain.com/metrics | grep vllm_queue_size

# Check GPU utilization
ansible gex44 -i inventory/production.yml -a "nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits"
```

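A minimal `curl-format.txt` for the timing flags used above, in case it does not already exist on the machine running the check:

```bash
# Write a timing template for curl -w (all values in seconds).
cat > curl-format.txt <<'EOF'
time_namelookup:    %{time_namelookup}\n
time_connect:       %{time_connect}\n
time_starttransfer: %{time_starttransfer}\n
time_total:         %{time_total}\n
EOF
```
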
**Solutions**:
1. **Scale Up**:
   ```bash
   # Add more GPU servers
   make scale-up ENV=production

   # Or manually order new servers
   python scripts/autoscaler.py --action=scale-up --count=1
   ```

2. **Optimize Configuration**:
   ```bash
   # Reduce model precision
   ansible gex44 -i inventory/production.yml -m lineinfile -a "path=/etc/vllm/config.env line='VLLM_DTYPE=float16' regexp='^VLLM_DTYPE='"

   # Increase the batch size
   ansible gex44 -i inventory/production.yml -m lineinfile -a "path=/etc/vllm/config.env line='VLLM_MAX_NUM_SEQS=512' regexp='^VLLM_MAX_NUM_SEQS='"
   ```

3. **Load Balancing**:
   ```bash
   # Check the load balancer configuration
   ansible load_balancers -i inventory/production.yml -a "curl -s http://localhost:8404/stats"

   # Verify all backends are healthy
   curl -s http://LOAD_BALANCER_IP:8404/stats | grep UP
   ```

## Load Balancer Issues

### Load Balancer Not Routing Traffic

**Symptoms**: Requests fail to reach the backend servers

**Diagnosis**:
```bash
# Check HAProxy status
ansible load_balancers -i inventory/production.yml -a "systemctl status haproxy"

# Validate the HAProxy configuration
ansible load_balancers -i inventory/production.yml -a "haproxy -f /etc/haproxy/haproxy.cfg -c"

# Check backend health
curl -s http://LOAD_BALANCER_IP:8404/stats

# Test direct backend access
curl -f http://10.0.1.10:8000/health
```

**Solutions**:
1. **Configuration Issues**:
   ```bash
   # Regenerate the HAProxy configuration
   ansible-playbook -i inventory/production.yml playbooks/load-balancer-setup.yml

   # Restart HAProxy
   ansible load_balancers -i inventory/production.yml -a "systemctl restart haproxy"
   ```

2. **Backend Health Issues**:
   ```bash
   # Check why backends are failing health checks
   ansible gex44 -i inventory/production.yml -a "curl -f http://localhost:8000/health"

   # Fix unhealthy backends
   ansible gex44 -i inventory/production.yml -a "systemctl restart vllm-api"
   ```

### SSL Certificate Issues

**Symptoms**: HTTPS requests fail with certificate errors

**Diagnosis**:
```bash
# Check certificate validity
openssl s_client -connect api.yourdomain.com:443 -servername api.yourdomain.com

# Check the certificate files
ansible load_balancers -i inventory/production.yml -a "ls -la /etc/ssl/certs/"

# Check certificate expiration (shell module for the pipe)
ansible load_balancers -i inventory/production.yml -m shell -a "openssl x509 -in /etc/ssl/certs/haproxy.pem -text -noout | grep 'Not After'"
```

**Solutions**:
1. **Renew Certificates**:
   ```bash
   # Renew the Let's Encrypt certificates
   ansible load_balancers -i inventory/production.yml -a "certbot renew"

   # Reload HAProxy
   ansible load_balancers -i inventory/production.yml -a "systemctl reload haproxy"
   ```

2. **Fix Certificate Configuration**:
   ```bash
   # Regenerate the certificate bundle (shell module for the redirect)
   ansible load_balancers -i inventory/production.yml -m shell -a "cat /etc/letsencrypt/live/api.yourdomain.com/fullchain.pem /etc/letsencrypt/live/api.yourdomain.com/privkey.pem > /etc/ssl/certs/haproxy.pem"
   ```

## Monitoring Issues

### Prometheus Not Collecting Metrics

**Symptoms**: Missing data in the Grafana dashboards

**Diagnosis**:
```bash
# Check Prometheus status
ansible monitoring -i inventory/production.yml -a "systemctl status prometheus"

# Validate the Prometheus configuration
ansible monitoring -i inventory/production.yml -a "promtool check config /etc/prometheus/prometheus.yml"

# Check target status
curl -s http://MONITORING_IP:9090/api/v1/targets | jq .

# Test the metric endpoints
curl -s http://10.0.1.10:9835/metrics | head -10
```

**Solutions**:
1. **Configuration Issues**:
   ```bash
   # Regenerate the Prometheus configuration
   ansible-playbook -i inventory/production.yml playbooks/monitoring-setup.yml --tags=prometheus

   # Restart Prometheus
   ansible monitoring -i inventory/production.yml -a "systemctl restart prometheus"
   ```

2. **Target Connectivity**:
   ```bash
   # Check network connectivity to the targets
   ansible monitoring -i inventory/production.yml -a "curl -f http://10.0.1.10:9835/metrics"

   # Check firewall rules (shell module for the pipe)
   ansible gex44 -i inventory/production.yml -m shell -a "ufw status | grep 9835"
   ```

### Grafana Dashboard Issues

**Symptoms**: Dashboards show no data or errors

**Diagnosis**:
```bash
# Check Grafana status
ansible monitoring -i inventory/production.yml -a "systemctl status grafana-server"

# Check the Grafana logs
ansible monitoring -i inventory/production.yml -a "journalctl -u grafana-server -n 50"

# Test the Prometheus data source
curl -s http://MONITORING_IP:3000/api/datasources
```

**Solutions**:
1. **Data Source Issues**:
   ```bash
   # Reconfigure the Grafana data sources
   ansible-playbook -i inventory/production.yml playbooks/monitoring-setup.yml --tags=grafana

   # Restart Grafana
   ansible monitoring -i inventory/production.yml -a "systemctl restart grafana-server"
   ```

2. **Dashboard Import Issues**:
   ```bash
   # Re-import the dashboards
   ansible-playbook -i inventory/production.yml playbooks/monitoring-setup.yml --tags=dashboards
   ```

## Performance Issues

### High CPU Usage

**Symptoms**: Server becomes slow, high load average

**Diagnosis**:
```bash
# Check CPU usage (shell module for the pipes)
ansible all -i inventory/production.yml -m shell -a "top -bn1 | head -20"

# Check the process list
ansible all -i inventory/production.yml -m shell -a "ps aux --sort=-%cpu | head -10"

# Check the load average
ansible all -i inventory/production.yml -a "uptime"
```

**Solutions**:
1. **Identify Resource-Heavy Processes**:
   ```bash
   # Kill problematic processes
   ansible TARGET_SERVER -i inventory/production.yml -a "pkill -f PROCESS_NAME"

   # Restart services
   ansible TARGET_SERVER -i inventory/production.yml -a "systemctl restart SERVICE_NAME"
   ```

2. **Scale Resources**:
   ```bash
   # Add more servers or upgrade existing ones
   # Consider upgrading the cloud server types in Terraform
   ```

### High Memory Usage

**Symptoms**: Out-of-memory errors, swap usage

**Diagnosis**:
```bash
# Check memory usage
ansible all -i inventory/production.yml -a "free -h"

# Check swap usage
ansible all -i inventory/production.yml -a "swapon --show"

# Check memory-heavy processes (shell module for the pipe)
ansible all -i inventory/production.yml -m shell -a "ps aux --sort=-%mem | head -10"
```

**Solutions**:
1. **Free Memory**:
   ```bash
   # Clear caches (shell module for '&&' and the redirect)
   ansible all -i inventory/production.yml -m shell -a "sync && echo 3 > /proc/sys/vm/drop_caches" --become

   # Restart memory-heavy services
   ansible gex44 -i inventory/production.yml -a "systemctl restart vllm-api"
   ```

2. **Optimize Configuration**:
   ```bash
   # Reduce the model cache size
   ansible gex44 -i inventory/production.yml -m lineinfile -a "path=/etc/vllm/config.env line='VLLM_SWAP_SPACE=2' regexp='^VLLM_SWAP_SPACE='"
   ```

## Network Issues

### High Latency Between Servers

**Symptoms**: Slow inter-server communication

**Diagnosis**:
```bash
# Test latency between servers
ansible all -i inventory/production.yml -a "ping -c 10 10.0.1.10"

# Check network interface statistics
ansible all -i inventory/production.yml -a "cat /proc/net/dev"

# Test bandwidth (requires an iperf3 server running on the target)
ansible all -i inventory/production.yml -a "iperf3 -c 10.0.1.10 -t 10"
```

**Solutions**:
1. **Network Optimization**:
   ```bash
   # Optimize network settings
   ansible-playbook -i inventory/production.yml playbooks/network-optimization.yml

   # Check for network congestion
   # Consider upgrading network interfaces
   ```

### DNS Resolution Issues

**Symptoms**: Domain names not resolving correctly

**Diagnosis**:
```bash
# Test DNS resolution
ansible all -i inventory/production.yml -a "nslookup api.yourdomain.com"

# Check the DNS configuration
ansible all -i inventory/production.yml -a "cat /etc/resolv.conf"

# Test external DNS
ansible all -i inventory/production.yml -a "nslookup google.com 8.8.8.8"
```

**Solutions**:
```bash
# Update the DNS configuration
ansible all -i inventory/production.yml -m lineinfile -a "path=/etc/resolv.conf line='nameserver 8.8.8.8'"

# Restart the resolver
ansible all -i inventory/production.yml -a "systemctl restart systemd-resolved"
```

## Emergency Procedures

### Complete Service Outage

1. **Immediate Response**:
   ```bash
   # Check all critical services
   make status ENV=production

   # Enable maintenance mode
   ansible load_balancers -i inventory/production.yml -a "systemctl stop haproxy"

   # Notify stakeholders
   ```

2. **Diagnosis**:
   ```bash
   # Check recent changes
   git log --since="2 hours ago" --oneline

   # Check system logs
   ansible all -i inventory/production.yml -a "journalctl --since '2 hours ago' --no-pager"

   # Check monitoring alerts
   curl -s http://MONITORING_IP:9090/api/v1/alerts
   ```

3. **Recovery**:
   ```bash
   # Roll back recent changes if necessary
   make rollback ENV=production BACKUP_DATE=YYYYMMDD

   # Or restart all services
   ansible all -i inventory/production.yml -a "systemctl restart vllm-api haproxy prometheus grafana-server"

   # Re-enable the load balancer
   ansible load_balancers -i inventory/production.yml -a "systemctl start haproxy"
   ```

### Data Loss Prevention

```bash
# Immediate backup
make backup ENV=production

# Snapshot critical volumes
# Use the Hetzner Cloud console to create snapshots

# Document the incident
# Create an incident report with the timeline and actions taken
```

For issues not covered in this guide, contact the infrastructure team or create an issue in the project repository with:
- A detailed problem description
- Error messages and logs
- Steps already taken
- Current system status

227
docs/deployment.md
Normal file
@ -0,0 +1,227 @@
# Deployment Guide

## Quick Start

### Prerequisites
- Ubuntu 24.04 on all servers
- Terraform 1.12+
- Ansible 8.0+
- Python 3.12+
- Hetzner Cloud + Robot API access

### Development Deployment

```bash
# 1. Initial setup
git clone <repository>
cd ai-infrastructure-hetzner

# 2. Environment variables
export HCLOUD_TOKEN="your-hetzner-cloud-token"
export HETZNER_ROBOT_USER="your-robot-username"
export HETZNER_ROBOT_PASSWORD="your-robot-password"

# 3. Terraform for development
cd terraform/environments/development
terraform init
terraform plan -var-file="dev.tfvars"
terraform apply -var-file="dev.tfvars"

# 4. Generate the Ansible inventory
cd ../../../inventories
python3 generate_inventory.py development

# 5. Configure the servers
cd ../ansible
ansible-playbook -i inventories/development/hosts.yml site.yml --limit development
```

### File Structure

```
inventories/
├── development/
│   ├── requirements.yml        # Dev business requirements
│   ├── hosts.yml               # Generated automatically
│   └── ssh_config              # Generated SSH config
├── staging/
│   ├── requirements.yml        # Staging business requirements
│   └── ...
├── production/
│   ├── requirements.yml        # Production business requirements
│   └── ...
└── generate_inventory.py       # Inventory generator
```

## Deployment Workflow

### Development → Staging → Production

```mermaid
graph LR
    A[develop branch] --> B[Auto Deploy DEV]
    B --> C[Integration Tests]
    C --> D[main branch]
    D --> E[Manual Deploy STAGING]
    E --> F[Load Tests]
    F --> G[v*.*.* tag]
    G --> H[Manual Deploy PROD]
    H --> I[Health Checks]
```

### Commands per Environment

```bash
# Development (automatic on push to develop)
terraform -chdir=terraform/environments/development apply -auto-approve
python3 inventories/generate_inventory.py development
ansible-playbook -i inventories/development/hosts.yml site.yml

# Staging (manual, from main)
terraform -chdir=terraform/environments/staging apply
python3 inventories/generate_inventory.py staging
ansible-playbook -i inventories/staging/hosts.yml site.yml --check
ansible-playbook -i inventories/staging/hosts.yml site.yml

# Production (manual, from a tag)
terraform -chdir=terraform/environments/production apply
python3 inventories/generate_inventory.py production
ansible-playbook -i inventories/production/hosts.yml site.yml --check
# Manual confirmation required
ansible-playbook -i inventories/production/hosts.yml site.yml
```

## Configuration per Environment

### Development
- **OS**: Ubuntu 24.04 LTS
- **Servers**: 1x CX31 (CPU-only)
- **Model**: DialoGPT-small (lightweight)
- **Deployment**: Automatic on develop
- **Tests**: Integration only

### Staging
- **OS**: Ubuntu 24.04 LTS
- **Servers**: 1x GEX44 + 1x CX21
- **Model**: Mixtral-8x7B (quantized)
- **Deployment**: Manual from main
- **Tests**: Integration + Load

### Production
- **OS**: Ubuntu 24.04 LTS
- **Servers**: 3x GEX44 + 2x CX31 + 1x CX21
- **Model**: Mixtral-8x7B (optimized)
- **Deployment**: Manual from a tag + confirmation
- **Tests**: Smoke + Health checks

## Rollback Procedures

### Application Rollback
```bash
# Via MLflow (recommended)
python3 scripts/rollback_model.py --environment production --version previous

# Via Ansible tags
ansible-playbook -i inventories/production/hosts.yml site.yml --tags "vllm" --extra-vars "model_version=v1.2.0"
```

### Infrastructure Rollback
```bash
# Terraform state rollback
terraform -chdir=terraform/environments/production state pull > backup.tfstate
terraform -chdir=terraform/environments/production import <resource> <id>

# Ansible configuration rollback
git checkout <previous-commit> ansible/
ansible-playbook -i inventories/production/hosts.yml site.yml --check
```

## Troubleshooting

### Diagnostic Commands
```bash
# Ubuntu 24.04 system check
ansible all -i inventories/production/hosts.yml -m setup -a "filter=ansible_distribution*"

# Service status
ansible gex44_production -i inventories/production/hosts.yml -m systemd -a "name=vllm-api"

# Application logs
ansible gex44_production -i inventories/production/hosts.yml -m shell -a "journalctl -u vllm-api --since '1 hour ago'"

# GPU status
ansible gex44_production -i inventories/production/hosts.yml -m shell -a "nvidia-smi"

# Test endpoints
curl https://ai-api.company.com/health
curl https://ai-api.company.com/v1/models
```

### Common Issues

#### GPU not detected
```bash
# Check the NVIDIA driver on Ubuntu 24.04
sudo nvidia-smi
sudo dkms status

# Reinstall if necessary
sudo apt purge nvidia-* -y
sudo apt install nvidia-driver-545 -y
sudo reboot
```

#### vLLM service failed
```bash
# Check logs
journalctl -u vllm-api -f

# Common issues:
# - OOM: reduce gpu_memory_utilization
# - Model not found: check the MLflow path
# - Port conflict: netstat -tulpn | grep 8000
```

#### Inventory generation failed
```bash
# Debug mode
python3 inventories/generate_inventory.py production --debug

# Manual verification
terraform -chdir=terraform/environments/production output -json > outputs.json
cat outputs.json | jq '.'
```

## Security Checklist

### Pre-deployment
- [ ] SSH keys deployed on Ubuntu 24.04
- [ ] Firewall rules configured
- [ ] Secrets in Ansible Vault
- [ ] SSL certificates ready

### Post-deployment
- [ ] SSH access working
- [ ] Services running (systemctl status)
- [ ] Endpoints responding
- [ ] Monitoring active
- [ ] Log aggregation working

## Performance Validation

### Load Testing
```bash
# Development - CPU only
python3 tests/load_test.py --endpoint https://dev-ai-api.internal --concurrent 5

# Staging - 1 GPU
python3 tests/load_test.py --endpoint https://staging-ai-api.company.com --concurrent 20

# Production - 3 GPUs
python3 tests/load_test.py --endpoint https://ai-api.company.com --concurrent 100
```

### Expected Performance
- **Development**: 1-5 tokens/sec (CPU simulation)
- **Staging**: 80-90 tokens/sec (1x RTX 4000 Ada)
- **Production**: 240-270 tokens/sec (3x RTX 4000 Ada)

249
docs/tools.md
Normal file
@ -0,0 +1,249 @@
# Tools & Technologies

## Core Infrastructure

### Infrastructure as Code
| Tool | Version | Purpose | License |
|------|---------|---------|---------|
| **Terraform** | 1.12+ | Infrastructure provisioning | MPL-2.0 |
| **Hetzner Provider** | 1.45+ | Hetzner Cloud resources | MPL-2.0 |

### Configuration Management
| Tool | Version | Purpose | License |
|------|---------|---------|---------|
| **Ansible** | 8.0+ | Server configuration | GPL-3.0 |
| **Ansible Vault** | Included | Secrets management | GPL-3.0 |

## Operating System & Runtime

### Base System
| Component | Version | Purpose | Support |
|-----------|---------|---------|---------|
| **Ubuntu Server** | 24.04 LTS | Base operating system | Until 2034 |
| **Docker** | 24.0.x | Container runtime | Docker Inc. |
| **systemd** | 253+ | Service management | Built-in |

### GPU Stack
| Component | Version | Purpose | Support |
|-----------|---------|---------|---------|
| **NVIDIA Driver** | 545.23.08 | GPU driver | NVIDIA |
| **CUDA Toolkit** | 12.3+ | GPU computing | NVIDIA |
| **NVIDIA Container Toolkit** | 1.14+ | Docker GPU support | NVIDIA |

## AI/ML Stack

### Inference Engine
| Tool | Version | Purpose | License |
|------|---------|---------|---------|
| **vLLM** | Latest | LLM inference server | Apache-2.0 |
| **PyTorch** | 2.5.0+ | Deep learning framework | BSD-3 |
| **Transformers** | 4.46.0+ | Model library | Apache-2.0 |
| **Accelerate** | 0.34.0+ | Training acceleration | Apache-2.0 |

### Model Management
| Tool | Version | Purpose | License |
|------|---------|---------|---------|
| **MLflow** | 2.8+ | Model lifecycle management | Apache-2.0 |
| **Hugging Face Hub** | 0.25.0+ | Model repository | Apache-2.0 |

### Quantization
| Tool | Version | Purpose | License |
|------|---------|---------|---------|
| **AWQ** | Latest | 4-bit quantization | MIT |
| **GPTQ** | Latest | Alternative quantization | MIT |
| **TorchAO** | Nightly | Advanced optimizations | BSD-3 |

## Networking & Load Balancing

### Load Balancing
| Tool | Version | Purpose | License |
|------|---------|---------|---------|
| **HAProxy** | 2.8+ | Load balancer | GPL-2.0 |
| **Keepalived** | 2.2+ | High availability | GPL-2.0 |

### SSL/TLS
| Tool | Version | Purpose | License |
|------|---------|---------|---------|
| **Let's Encrypt** | Current | Free SSL certificates | ISRG |
| **Certbot** | 2.7+ | Certificate automation | Apache-2.0 |

## Monitoring & Observability

### Core Monitoring
| Tool | Version | Purpose | License |
|------|---------|---------|---------|
| **Prometheus** | 2.47+ | Metrics collection | Apache-2.0 |
| **Grafana** | 10.2+ | Metrics visualization | AGPL-3.0 |
| **AlertManager** | 0.26+ | Alert routing | Apache-2.0 |

### Exporters
| Tool | Version | Purpose | License |
|------|---------|---------|---------|
| **Node Exporter** | 1.7+ | System metrics | Apache-2.0 |
| **nvidia-smi Exporter** | Custom | GPU metrics | MIT |
| **HAProxy Exporter** | 0.15+ | Load balancer metrics | Apache-2.0 |

### Log Management
| Tool | Version | Purpose | License |
|------|---------|---------|---------|
| **systemd-journald** | Built-in | Log collection | GPL-2.0 |
| **Logrotate** | 3.21+ | Log rotation | GPL-2.0 |

## CI/CD & Development

### CI/CD Platform
| Tool | Version | Purpose | License |
|------|---------|---------|---------|
| **GitLab** | 16.0+ | CI/CD pipeline | MIT |
| **GitLab Runner** | 16.0+ | Job execution | MIT |

### Development Tools
| Tool | Version | Purpose | License |
|------|---------|---------|---------|
| **Python** | 3.12+ | Scripting language | PSF |
| **pip** | 23.0+ | Package manager | MIT |
| **Poetry** | 1.7+ | Dependency management | MIT |

### Testing
| Tool | Version | Purpose | License |
|------|---------|---------|---------|
| **pytest** | 7.4+ | Python testing | MIT |
| **requests** | 2.31+ | HTTP testing | Apache-2.0 |
| **locust** | 2.17+ | Load testing | MIT |

## Security & Compliance

### Firewall & Security
| Tool | Version | Purpose | License |
|------|---------|---------|---------|
| **ufw** | 0.36+ | Firewall management | GPL-3.0 |
| **fail2ban** | 1.0+ | Intrusion prevention | GPL-2.0 |
| **SSH** | OpenSSH 9.3+ | Secure access | BSD |

### Secrets Management
| Tool | Version | Purpose | License |
|------|---------|---------|---------|
| **Ansible Vault** | Built-in | Configuration secrets | GPL-3.0 |
| **GitLab CI Variables** | Built-in | CI/CD secrets | MIT |

## Cloud Provider APIs

### Hetzner Services
| Service | API Version | Purpose | Pricing |
|---------|-------------|---------|---------|
| **Hetzner Cloud** | v1 | Cloud resources | Pay-per-use |
| **Hetzner Robot** | v1 | Dedicated servers | Monthly |
| **Hetzner DNS** | v1 | DNS management | Free |

## Backup & Storage

### Storage Solutions
| Tool | Version | Purpose | License |
|------|---------|---------|---------|
| **rsync** | 3.2+ | File synchronization | GPL-3.0 |
| **tar** | 1.34+ | Archive creation | GPL-3.0 |
| **gzip** | 1.12+ | Compression | GPL-3.0 |

### Cloud Storage
| Service | Purpose | Pricing |
|---------|---------|---------|
| **Hetzner Storage Box** | Backup storage | €0.0104/GB/month |
| **Hetzner Cloud Volumes** | Block storage | €0.0476/GB/month |

## Performance & Optimization

### System Optimization
| Tool | Version | Purpose | License |
|------|---------|---------|---------|
| **htop** | 3.2+ | Process monitoring | GPL-2.0 |
| **iotop** | 0.6+ | I/O monitoring | GPL-2.0 |
| **nvidia-smi** | Included | GPU monitoring | NVIDIA |

### Network Optimization
| Tool | Version | Purpose | License |
|------|---------|---------|---------|
| **iperf3** | 3.12+ | Network testing | BSD-3 |
| **tc** | Built-in | Traffic control | GPL-2.0 |

## Documentation & Collaboration

### Documentation
| Tool | Version | Purpose | License |
|------|---------|---------|---------|
| **Markdown** | CommonMark | Documentation format | BSD |
| **Mermaid** | 10.6+ | Diagram generation | MIT |

### Version Control
| Tool | Version | Purpose | License |
|------|---------|---------|---------|
| **Git** | 2.40+ | Version control | GPL-2.0 |
| **Git LFS** | 3.4+ | Large file storage | MIT |

## Installation Commands

### Ubuntu 24.04 Setup
```bash
# Update system
sudo apt update && sudo apt upgrade -y

# Install core tools
sudo apt install -y curl wget git python3-pip python3-venv

# Install Docker
curl -fsSL https://get.docker.com -o get-docker.sh
sudo sh get-docker.sh

# Install NVIDIA drivers (on GEX44)
sudo apt install -y nvidia-driver-545
sudo nvidia-smi

# Install Terraform
wget -O- https://apt.releases.hashicorp.com/gpg | sudo gpg --dearmor -o /usr/share/keyrings/hashicorp-archive-keyring.gpg
echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com $(lsb_release -cs) main" | sudo tee /etc/apt/sources.list.d/hashicorp.list
sudo apt update && sudo apt install -y terraform

# Install Ansible
sudo apt install -y ansible

# Install Python dependencies
# (Ubuntu 24.04 ships an externally managed Python per PEP 668, so use a venv)
python3 -m venv ~/.venvs/ai-infra
~/.venvs/ai-infra/bin/pip install mlflow requests prometheus-client
```
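
The GPU stack table lists the NVIDIA Container Toolkit, which the setup block above does not install. A sketch following NVIDIA's standard apt instructions:

```bash
# Install the NVIDIA Container Toolkit so Docker can see the GPU
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \
  sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
  sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
  sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
sudo apt update && sudo apt install -y nvidia-container-toolkit

# Wire the toolkit into Docker and restart the daemon
sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker
```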

### Verification Commands
```bash
# Verify versions
terraform version
ansible --version
docker version
python3 --version

# Verify GPU (on GEX44)
nvidia-smi
docker run --rm --gpus all nvidia/cuda:12.3.2-runtime-ubuntu22.04 nvidia-smi
```

## Architecture Compatibility

### Supported Hardware
- **CPU**: Intel x86_64, AMD x86_64
- **GPU**: NVIDIA RTX 4000 Ada (Compute Capability 8.9)
- **Memory**: 64 GB DDR4 minimum
- **Storage**: NVMe SSD minimum

### Network Requirements
- **Bandwidth**: 1 Gbps minimum
- **Latency**: < 10 ms intra-datacenter
- **Ports**: 22 (SSH), 80/443 (HTTP/HTTPS), 8000 (vLLM), 9090-9100 (monitoring)

## License Compliance

### Open Source Components
- **GPL-licensed**: Linux kernel, systemd, Ansible
- **Apache-licensed**: Terraform, MLflow, Prometheus
- **MIT-licensed**: Docker, GitLab, pytest
- **BSD-licensed**: PyTorch, OpenSSH

### Proprietary Components
- **NVIDIA drivers**: NVIDIA License (redistribution restrictions)
- **Hetzner services**: Commercial terms
- **GitLab Enterprise**: Commercial (if used)

118
inventories/README.md
Normal file
@ -0,0 +1,118 @@
# Infrastructure Inventories

A structure that separates business requirements (Terraform) from server configuration (Ansible).

## Structure

```
inventories/
├── terraform/                   # INPUTS: business requirements per environment
│   ├── development/
│   │   └── requirements.yml     # Dev requirements (CPU-only, limited costs)
│   ├── staging/
│   │   └── requirements.yml     # Staging requirements (1 GPU, full test suite)
│   └── production/
│       └── requirements.yml     # Prod requirements (3 GPUs, HA, monitoring)
│
└── ansible/                     # OUTPUTS: generated inventories for configuration
    ├── development/
    │   └── hosts.yml            # Dev inventory generated by Terraform
    ├── staging/
    │   └── hosts.yml            # Staging inventory generated by Terraform
    └── production/
        └── hosts.yml            # Prod inventory generated by Terraform
```

## Principle

**`terraform/`** = **INPUTS** (what we want)
**`ansible/`** = **OUTPUTS** (what is deployed)

## Workflow

### 1. Define the requirements (Terraform)
```yaml
# inventories/terraform/production/requirements.yml
environment: production
infrastructure:
  compute:
    gex44_nodes: 3
models:
  primary: "mistralai/Mixtral-8x7B-Instruct-v0.1"
security:
  ssl_certificates:
    - name: "ai-api-prod"
      domains: ["ai-api.company.com"]
```

### 2. Automatic generation (Terraform)
```bash
# The Terraform module reads requirements.yml and generates hosts.yml
terraform apply
# → Creates inventories/ansible/production/hosts.yml
```

### 3. Server configuration (Ansible)
```bash
# Ansible uses the generated inventory
ansible-playbook -i inventories/ansible/production/hosts.yml site.yml
```
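
To sanity-check what Terraform generated before running any playbook, the inventory can be rendered as a tree with stock Ansible tooling; a quick sketch:

```bash
# Show the generated group/host hierarchy without touching any server
ansible-inventory -i inventories/ansible/production/hosts.yml --graph
```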

## Benefits of this separation

### Terraform (`requirements.yml`)
- **Business requirements**: how many GPUs? which model?
- **Budget constraints**: cost targets per environment
- **Security policy**: certificates, domains, firewall
- **Easy to evolve**: editable without knowing Ansible

### Ansible (`hosts.yml`)
- **Technical configuration**: IPs, ports, versions
- **Server details**: hardware specifications
- **Runtime variables**: passwords, certificates
- **Generated automatically**: always in sync with Terraform

## Usage Examples

### Development
```bash
# 1. Define the requirements
vim inventories/terraform/development/requirements.yml

# 2. Deploy the infrastructure
terraform -chdir=terraform/environments/development apply

# 3. Configure the servers
ansible-playbook -i inventories/ansible/development/hosts.yml site.yml --limit development
```

### Production
```bash
# 1. Validate the business requirements
vim inventories/terraform/production/requirements.yml

# 2. Plan the infrastructure
terraform -chdir=terraform/environments/production plan

# 3. Deploy with confirmation
terraform -chdir=terraform/environments/production apply

# 4. Configure, with a dry run first
ansible-playbook -i inventories/ansible/production/hosts.yml site.yml --check --limit production
ansible-playbook -i inventories/ansible/production/hosts.yml site.yml --limit production
```
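
For CI, `terraform plan -detailed-exitcode` makes drift detection between the requirements and the deployed state scriptable; a sketch:

```bash
# Exit 0 = in sync, 2 = changes pending, anything else = plan failed
terraform -chdir=terraform/environments/production plan -detailed-exitcode
case $? in
  0) echo "in sync" ;;
  2) echo "drift detected: requirements and deployed state differ" ;;
  *) echo "plan failed" ;;
esac
```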

## Maintenance

### Changing the requirements
1. Edit `inventories/terraform/{env}/requirements.yml`
2. Run `terraform plan` to review the changes
3. Apply with `terraform apply`
4. The Ansible inventory updates automatically

### Adding an environment
1. Create `inventories/terraform/preproduction/requirements.yml`
2. Create `terraform/environments/preproduction/`
3. The Ansible inventory is generated on the first `terraform apply`

This structure cleanly separates **business strategy** (requirements) from **implementation details** (hosts), which makes maintenance and evolution easier.
37
inventories/ansible/development/hosts.yml
Normal file
@ -0,0 +1,37 @@
# inventories/ansible/development/hosts.yml
# Generated by Terraform - Development Ansible inventory
all:
  vars:
    environment: development
    os_family: ubuntu
    os_version: "24.04"
    ansible_user: ubuntu
    ansible_python_interpreter: /usr/bin/python3
    ansible_ssh_private_key_file: ~/.ssh/hetzner-development

  children:
    dev_servers:
      hosts:
        dev-ai-server:
          ansible_host: 95.217.126.30
          private_ip: 10.1.1.10
          cpu_only: true
          vllm_port: 8000
      vars:
        docker_version: "24.0.*"
        ubuntu_version: "24.04"
        model_name: "microsoft/DialoGPT-small"
        quantization: "none"
        gpu_simulation: true

    monitoring:
      hosts:
        monitoring-development:
          ansible_host: 95.217.126.30
          private_ip: 10.1.1.10
          prometheus_retention: 7d
          alert_severity: info
      vars:
        prometheus_version: "2.47.2"
        grafana_version: "10.2.0"
        ubuntu_version: "24.04"
74
inventories/ansible/production/hosts.yml
Normal file
@ -0,0 +1,74 @@
# inventories/ansible/production/hosts.yml
# Generated by Terraform - Production Ansible inventory
all:
  vars:
    environment: production
    os_family: ubuntu
    os_version: "24.04"
    ansible_user: ubuntu
    ansible_python_interpreter: /usr/bin/python3
    ansible_ssh_private_key_file: ~/.ssh/hetzner-production

  children:
    load_balancer:
      hosts:
        lb-1-production:
          ansible_host: 95.217.123.45
          private_ip: 10.0.1.10
          role: primary
          haproxy_priority: 100
        lb-2-production:
          ansible_host: 95.217.123.46
          private_ip: 10.0.1.11
          role: backup
          haproxy_priority: 90
      vars:
        haproxy_backend_servers:
          - 10.0.1.101
          - 10.0.1.102
          - 10.0.1.103
        ssl_certificate_type: commercial
        ssl_certificates:
          - name: "ai-api-prod"
            domains: ["ai-api.company.com", "*.ai-api.company.com"]
            type: "commercial"

    gex44_production:
      hosts:
        gex44-prod-1:
          ansible_host: 95.217.124.10
          private_ip: 10.0.1.101
          gpu_type: RTX_4000_Ada_20GB
          vllm_port: 8000
          metrics_port: 9400
        gex44-prod-2:
          ansible_host: 95.217.124.11
          private_ip: 10.0.1.102
          gpu_type: RTX_4000_Ada_20GB
          vllm_port: 8000
          metrics_port: 9400
        gex44-prod-3:
          ansible_host: 95.217.124.12
          private_ip: 10.0.1.103
          gpu_type: RTX_4000_Ada_20GB
          vllm_port: 8000
          metrics_port: 9400
      vars:
        nvidia_driver_version: "545.23.08"
        docker_version: "24.0.*"
        ubuntu_version: "24.04"
        model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
        quantization: "awq"
        gpu_memory_utilization: 0.95

    monitoring:
      hosts:
        monitoring-production:
          ansible_host: 95.217.125.20
          private_ip: 10.0.1.20
          prometheus_retention: 90d
          alert_severity: critical
      vars:
        prometheus_version: "2.47.2"
        grafana_version: "10.2.0"
        ubuntu_version: "24.04"
53
inventories/ansible/staging/hosts.yml
Normal file
@ -0,0 +1,53 @@
# inventories/ansible/staging/hosts.yml
# Generated by Terraform - Staging Ansible inventory
all:
  vars:
    environment: staging
    os_family: ubuntu
    os_version: "24.04"
    ansible_user: ubuntu
    ansible_python_interpreter: /usr/bin/python3
    ansible_ssh_private_key_file: ~/.ssh/hetzner-staging

  children:
    load_balancer:
      hosts:
        staging-lb:
          ansible_host: 95.217.127.40
          private_ip: 10.2.1.10
          role: single
      vars:
        haproxy_backend_servers:
          - 10.2.1.101
        ssl_certificates:
          - name: "staging-ai-api"
            domains: ["staging-ai-api.company.com"]
            type: "letsencrypt"

    gex44_staging:
      hosts:
        gex44-staging-1:
          ansible_host: 95.217.128.50
          private_ip: 10.2.1.101
          gpu_type: RTX_4000_Ada_20GB
          vllm_port: 8000
          metrics_port: 9400
      vars:
        nvidia_driver_version: "545.23.08"
        docker_version: "24.0.*"
        ubuntu_version: "24.04"
        model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
        quantization: "awq"
        gpu_memory_utilization: 0.80

    monitoring:
      hosts:
        monitoring-staging:
          ansible_host: 95.217.127.41
          private_ip: 10.2.1.20
          prometheus_retention: 30d
          alert_severity: warning
      vars:
        prometheus_version: "2.47.2"
        grafana_version: "10.2.0"
        ubuntu_version: "24.04"
70
inventories/terraform/development/requirements.yml
Normal file
@ -0,0 +1,70 @@
# inventories/terraform/development/requirements.yml
# Infrastructure requirements for the Development environment

environment: development
cost_budget: 50  # EUR/month

infrastructure:
  compute:
    gex44_nodes: 0  # Use CPU simulation instead
    cloud_servers:
      - name: dev-ai-server
        type: cx31
        cpu: 4
        ram: 8
        disk: 80
        gpu_simulation: true

  network:
    private_network: "10.1.0.0/16"
    subnet: "10.1.1.0/24"

  monitoring:
    enabled: true
    retention: 7d
    server_type: cx11

models:
  primary: "microsoft/DialoGPT-small"
  quantization: none
  max_context: 1024
  gpu_memory_limit: 0.5

scaling:
  min_nodes: 1
  max_nodes: 1
  auto_scaling: false

security:
  firewall_rules:
    - port: 22
      protocol: tcp
      source: "office_ips"
    - port: 8000
      protocol: tcp
      source: "internal_network"
  ssl_certificates:
    - name: "dev-ai-api"
      type: "letsencrypt"
      domains:
        - "dev-ai-api.internal"
      dns_provider: "hetzner"
      tags:
        - "development"
        - "api"
        - "internal"
      auto_renewal: true
      key_size: 2048

integrations:
  mlflow:
    url: "http://mlflow-dev.internal:5000"
    experiments: true
    model_registry: false

  monitoring:
    prometheus_retention: 7d
    alert_severity: info

backup:
  enabled: false
155
inventories/terraform/production/requirements.yml
Normal file
@ -0,0 +1,155 @@
# inventories/terraform/production/requirements.yml
# Infrastructure requirements for the Production environment

environment: production
cost_budget: 700  # EUR/month

infrastructure:
  compute:
    gex44_nodes: 3
    specifications:
      - name: gex44-prod-1
        gpu: RTX_4000_Ada_20GB
        cpu: Intel_i5_13500
        ram: 64
        nvme: 2x1TB
      - name: gex44-prod-2
        gpu: RTX_4000_Ada_20GB
        cpu: Intel_i5_13500
        ram: 64
        nvme: 2x1TB
      - name: gex44-prod-3
        gpu: RTX_4000_Ada_20GB
        cpu: Intel_i5_13500
        ram: 64
        nvme: 2x1TB

    cloud_servers:
      - name: prod-lb-1
        type: cx31
        cpu: 4
        ram: 8
        disk: 80
        role: load_balancer
        ha: true
      - name: prod-lb-2
        type: cx31
        cpu: 4
        ram: 8
        disk: 80
        role: load_balancer_backup
        ha: true
      - name: prod-monitoring
        type: cx21
        cpu: 2
        ram: 4
        disk: 40
        role: monitoring

  network:
    private_network: "10.0.0.0/16"
    subnet: "10.0.1.0/24"
    load_balancer_ips:
      - "10.0.1.10"
      - "10.0.1.11"
    gex44_ips:
      - "10.0.1.101"
      - "10.0.1.102"
      - "10.0.1.103"

  storage:
    volumes:
      - name: models-storage
        size: 100
        type: nvme
      - name: monitoring-data
        size: 50
        type: nvme
      - name: backups
        size: 200
        type: standard

  monitoring:
    enabled: true
    retention: 90d
    high_availability: true
    external_monitoring: true

models:
  primary: "mistralai/Mixtral-8x7B-Instruct-v0.1"
  quantization: awq
  max_context: 4096
  gpu_memory_limit: 0.95
  fallback_model: "mistralai/Mixtral-8x7B-Instruct-v0.1"

scaling:
  min_nodes: 2
  max_nodes: 5
  auto_scaling: true
  scale_up_threshold: 0.80
  scale_down_threshold: 0.30
  cooldown_period: 600  # seconds

security:
  firewall_rules:
    - port: 443
      protocol: tcp
      source: "0.0.0.0/0"
    - port: 22
      protocol: tcp
      source: "admin_ips"
    - port: 8000
      protocol: tcp
      source: "load_balancer_ips"
  ssl_certificates:
    - name: "ai-api-prod"
      type: "commercial"  # letsencrypt, commercial, self-signed
      domains:
        - "ai-api.company.com"
        - "*.ai-api.company.com"
      dns_provider: "hetzner"  # hetzner, cloudflare, route53
      tags:
        - "production"
        - "api"
        - "wildcard"
      auto_renewal: true
      key_size: 2048
    - name: "monitoring-prod"
      type: "letsencrypt"
      domains:
        - "monitoring-prod.company.com"
      dns_provider: "hetzner"
      tags:
        - "production"
        - "monitoring"
        - "internal"
      auto_renewal: true
      key_size: 2048
  waf_enabled: true
  intrusion_detection: true

integrations:
  mlflow:
    url: "https://mlflow-prod.company.com:5000"
    experiments: true
    model_registry: true
    backup_enabled: true

  monitoring:
    prometheus_retention: 90d
    alert_severity: critical
    external_integrations:
      - pagerduty
      - slack

backup:
  enabled: true
  frequency: daily
  retention: 30d
  encryption: true

compliance:
  gdpr: true
  data_residency: eu
  audit_logging: true
  access_control: rbac
87
inventories/terraform/staging/requirements.yml
Normal file
@ -0,0 +1,87 @@
# inventories/terraform/staging/requirements.yml
# Infrastructure requirements for the Staging environment

environment: staging
cost_budget: 250  # EUR/month

infrastructure:
  compute:
    gex44_nodes: 1
    specifications:
      - name: gex44-staging-1
        gpu: RTX_4000_Ada_20GB
        cpu: Intel_i5_13500
        ram: 64
        nvme: 2x1TB

    cloud_servers:
      - name: staging-lb
        type: cx21
        cpu: 2
        ram: 4
        disk: 40
        role: load_balancer
      - name: staging-monitoring
        type: cx11
        cpu: 1
        ram: 4
        disk: 20
        role: monitoring

  network:
    private_network: "10.2.0.0/16"
    subnet: "10.2.1.0/24"
    load_balancer_ip: "10.2.1.10"
    gex44_ip: "10.2.1.101"

  monitoring:
    enabled: true
    retention: 30d

models:
  primary: "mistralai/Mixtral-8x7B-Instruct-v0.1"
  quantization: awq
  max_context: 2048
  gpu_memory_limit: 0.80

scaling:
  min_nodes: 1
  max_nodes: 2
  auto_scaling: true
  scale_up_threshold: 0.85
  scale_down_threshold: 0.40

security:
  firewall_rules:
    - port: 443
      protocol: tcp
      source: "0.0.0.0/0"
    - port: 22
      protocol: tcp
      source: "office_ips"
  ssl_certificates:
    - name: "staging-ai-api"
      type: "letsencrypt"
      domains:
        - "staging-ai-api.company.com"
      dns_provider: "hetzner"
      tags:
        - "staging"
        - "api"
        - "external"
      auto_renewal: true
      key_size: 2048

integrations:
  mlflow:
    url: "https://mlflow-staging.internal:5000"
    experiments: true
    model_registry: true

  monitoring:
    prometheus_retention: 30d
    alert_severity: warning

backup:
  enabled: true
  frequency: weekly
303
monitoring/grafana/dashboards/gpu-metrics.json
Normal file
@ -0,0 +1,303 @@
{
  "dashboard": {
    "id": null,
    "title": "GPU Performance & Utilization",
    "tags": ["gpu", "nvidia", "performance"],
    "style": "dark",
    "timezone": "UTC",
    "refresh": "10s",
    "time": {"from": "now-1h", "to": "now"},
    "panels": [
      {
        "id": 1,
        "title": "GPU Utilization",
        "type": "timeseries",
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
        "targets": [
          {
            "expr": "nvidia_smi_utilization_gpu_ratio * 100",
            "legendFormat": "GPU {{instance}}",
            "refId": "A"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "min": 0,
            "max": 100,
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 70},
                {"color": "red", "value": 90}
              ]
            }
          }
        },
        "options": {
          "legend": {
            "displayMode": "table",
            "values": ["current", "max", "mean"]
          }
        }
      },
      {
        "id": 2,
        "title": "GPU Memory Usage",
        "type": "timeseries",
        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
        "targets": [
          {
            "expr": "nvidia_smi_memory_used_bytes / nvidia_smi_memory_total_bytes * 100",
            "legendFormat": "Memory {{instance}}",
            "refId": "A"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "min": 0,
            "max": 100,
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 80},
                {"color": "red", "value": 95}
              ]
            }
          }
        }
      },
      {
        "id": 3,
        "title": "GPU Temperature",
        "type": "timeseries",
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
        "targets": [
          {
            "expr": "nvidia_smi_temperature_gpu",
            "legendFormat": "Temp {{instance}}",
            "refId": "A"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "celsius",
            "min": 0,
            "max": 100,
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 75},
                {"color": "red", "value": 85}
              ]
            }
          }
        }
      },
      {
        "id": 4,
        "title": "GPU Power Consumption",
        "type": "timeseries",
        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
        "targets": [
          {
            "expr": "nvidia_smi_power_draw_watts",
            "legendFormat": "Power {{instance}}",
            "refId": "A"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "watt",
            "min": 0,
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 200},
                {"color": "red", "value": 250}
              ]
            }
          }
        }
      },
      {
        "id": 5,
        "title": "Current GPU Stats",
        "type": "stat",
        "gridPos": {"h": 4, "w": 24, "x": 0, "y": 16},
        "targets": [
          {
            "expr": "nvidia_smi_utilization_gpu_ratio * 100",
            "legendFormat": "{{instance}} GPU %",
            "refId": "A"
          },
          {
            "expr": "nvidia_smi_memory_used_bytes / 1024 / 1024 / 1024",
            "legendFormat": "{{instance}} Memory GB",
            "refId": "B"
          },
          {
            "expr": "nvidia_smi_temperature_gpu",
            "legendFormat": "{{instance}} Temp °C",
            "refId": "C"
          }
        ],
        "fieldConfig": {
          "defaults": {"unit": "short", "decimals": 1},
          "overrides": [
            {
              "matcher": {"id": "byRegexp", "options": ".* Memory GB"},
              "properties": [{"id": "unit", "value": "decgbytes"}]
            },
            {
              "matcher": {"id": "byRegexp", "options": ".* Temp °C"},
              "properties": [{"id": "unit", "value": "celsius"}]
            }
          ]
        },
        "options": {
          "reduceOptions": {"values": false, "calcs": ["lastNotNull"], "fields": ""},
          "orientation": "horizontal",
          "textMode": "value_and_name"
        }
      },
      {
        "id": 6,
        "title": "GPU Memory Details",
        "type": "timeseries",
        "gridPos": {"h": 8, "w": 24, "x": 0, "y": 20},
        "targets": [
          {
            "expr": "nvidia_smi_memory_used_bytes / 1024 / 1024 / 1024",
            "legendFormat": "{{instance}} Used",
            "refId": "A"
          },
          {
            "expr": "nvidia_smi_memory_free_bytes / 1024 / 1024 / 1024",
            "legendFormat": "{{instance}} Free",
            "refId": "B"
          },
          {
            "expr": "nvidia_smi_memory_total_bytes / 1024 / 1024 / 1024",
            "legendFormat": "{{instance}} Total",
            "refId": "C"
          }
        ],
        "fieldConfig": {
          "defaults": {"unit": "decgbytes", "min": 0}
        }
      },
      {
        "id": 7,
        "title": "GPU Encoder/Decoder Utilization",
        "type": "table",
        "gridPos": {"h": 8, "w": 24, "x": 0, "y": 28},
        "targets": [
          {
            "expr": "nvidia_smi_utilization_encoder_ratio",
            "legendFormat": "Encoder {{instance}}",
            "refId": "A",
            "format": "table"
          },
          {
            "expr": "nvidia_smi_utilization_decoder_ratio",
            "legendFormat": "Decoder {{instance}}",
            "refId": "B",
            "format": "table"
          }
        ],
        "transformations": [
          {"id": "merge", "options": {}}
        ]
      }
    ],
    "annotations": {
      "list": [
        {
          "name": "GPU Alerts",
          "enable": true,
          "iconColor": "rgba(255, 96, 96, 1)",
          "datasource": "Prometheus",
          "expr": "ALERTS{alertname=~\"GPU.*\"}"
        }
      ]
    },
    "templating": {
      "list": [
        {
          "name": "instance",
          "type": "query",
          "datasource": "Prometheus",
          "query": "label_values(nvidia_smi_utilization_gpu_ratio, instance)",
          "multi": true,
          "includeAll": true,
          "allValue": ".*"
        }
      ]
    },
    "links": [
      {
        "title": "Inference Performance",
        "url": "/d/inference-performance",
        "type": "dashboards"
      },
      {
        "title": "Cost Tracking",
        "url": "/d/cost-tracking",
        "type": "dashboards"
      }
    ]
  }
}
417
monitoring/grafana/dashboards/inference-performance.json
Normal file
@ -0,0 +1,417 @@
{
  "dashboard": {
    "id": null,
    "title": "AI Inference Performance",
    "tags": ["inference", "vllm", "performance", "latency"],
    "style": "dark",
    "timezone": "UTC",
    "refresh": "10s",
    "time": {"from": "now-1h", "to": "now"},
    "panels": [
      {
        "id": 1,
        "title": "Requests per Second",
        "type": "timeseries",
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
        "targets": [
          {
            "expr": "sum(rate(vllm_requests_total{status=\"200\"}[5m]))",
            "legendFormat": "Successful RPS",
            "refId": "A"
          },
          {
            "expr": "sum(rate(vllm_requests_total{status!=\"200\"}[5m]))",
            "legendFormat": "Error RPS",
            "refId": "B"
          },
          {
            "expr": "sum(rate(vllm_requests_total[5m]))",
            "legendFormat": "Total RPS",
            "refId": "C"
          }
        ],
        "fieldConfig": {"defaults": {"unit": "reqps", "min": 0}}
      },
      {
        "id": 2,
        "title": "Response Time Percentiles",
        "type": "timeseries",
        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
        "targets": [
          {
            "expr": "histogram_quantile(0.50, sum(rate(vllm_request_duration_seconds_bucket[5m])) by (le))",
            "legendFormat": "P50",
            "refId": "A"
          },
          {
            "expr": "histogram_quantile(0.95, sum(rate(vllm_request_duration_seconds_bucket[5m])) by (le))",
            "legendFormat": "P95",
            "refId": "B"
          },
          {
            "expr": "histogram_quantile(0.99, sum(rate(vllm_request_duration_seconds_bucket[5m])) by (le))",
            "legendFormat": "P99",
            "refId": "C"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "s",
            "min": 0,
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 2},
                {"color": "red", "value": 5}
              ]
            }
          }
        }
      },
      {
        "id": 3,
        "title": "Token Generation Rate",
        "type": "timeseries",
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
        "targets": [
          {
            "expr": "sum(rate(vllm_tokens_generated_total[5m]))",
            "legendFormat": "Tokens/sec",
            "refId": "A"
          },
          {
            "expr": "sum(rate(vllm_tokens_generated_total[5m])) by (instance)",
            "legendFormat": "{{instance}}",
            "refId": "B"
          }
        ],
        "fieldConfig": {"defaults": {"unit": "tps", "min": 0}}
      },
      {
        "id": 4,
        "title": "Queue Size",
        "type": "timeseries",
        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
        "targets": [
          {
            "expr": "sum(vllm_queue_size)",
            "legendFormat": "Total Queue",
            "refId": "A"
          },
          {
            "expr": "vllm_queue_size",
            "legendFormat": "{{instance}}",
            "refId": "B"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "short",
            "min": 0,
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 10},
                {"color": "red", "value": 50}
              ]
            }
          }
        }
      },
      {
        "id": 5,
        "title": "Error Rate",
        "type": "stat",
        "gridPos": {"h": 4, "w": 6, "x": 0, "y": 16},
        "targets": [
          {
            "expr": "sum(rate(vllm_requests_total{status!=\"200\"}[5m])) / sum(rate(vllm_requests_total[5m])) * 100",
            "legendFormat": "Error Rate %",
            "refId": "A"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "decimals": 2,
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 1},
                {"color": "red", "value": 5}
              ]
            }
          }
        }
      },
      {
        "id": 6,
        "title": "Average Response Time",
        "type": "stat",
        "gridPos": {"h": 4, "w": 6, "x": 6, "y": 16},
        "targets": [
          {
            "expr": "sum(rate(vllm_request_duration_seconds_sum[5m])) / sum(rate(vllm_requests_total[5m]))",
            "legendFormat": "Avg Response",
            "refId": "A"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "s",
            "decimals": 2,
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 2},
                {"color": "red", "value": 5}
              ]
            }
          }
        }
      },
      {
        "id": 7,
        "title": "Throughput (Tokens/Request)",
        "type": "stat",
        "gridPos": {"h": 4, "w": 6, "x": 12, "y": 16},
        "targets": [
          {
            "expr": "sum(rate(vllm_tokens_generated_total[5m])) / sum(rate(vllm_requests_total{status=\"200\"}[5m]))",
            "legendFormat": "Avg Tokens/Request",
            "refId": "A"
          }
        ],
        "fieldConfig": {"defaults": {"unit": "short", "decimals": 1}}
      },
      {
        "id": 8,
        "title": "Active Connections",
        "type": "stat",
        "gridPos": {"h": 4, "w": 6, "x": 18, "y": 16},
        "targets": [
          {
            "expr": "sum(vllm_active_connections)",
            "legendFormat": "Active Connections",
            "refId": "A"
          }
        ],
        "fieldConfig": {"defaults": {"unit": "short"}}
      },
      {
        "id": 9,
        "title": "Model Performance by Instance",
        "type": "table",
        "gridPos": {"h": 8, "w": 24, "x": 0, "y": 20},
        "targets": [
          {
            "expr": "rate(vllm_requests_total{status=\"200\"}[5m])",
            "legendFormat": "RPS",
            "refId": "A",
            "format": "table"
          },
          {
            "expr": "histogram_quantile(0.95, rate(vllm_request_duration_seconds_bucket[5m]))",
            "legendFormat": "P95 Latency",
            "refId": "B",
            "format": "table"
          },
          {
            "expr": "rate(vllm_tokens_generated_total[5m])",
            "legendFormat": "Tokens/sec",
            "refId": "C",
            "format": "table"
          },
          {
            "expr": "vllm_queue_size",
            "legendFormat": "Queue Size",
            "refId": "D",
            "format": "table"
          }
        ],
        "transformations": [
          {"id": "merge", "options": {}},
          {
            "id": "organize",
            "options": {
              "excludeByName": {"Time": true, "__name__": true, "job": true},
              "renameByName": {
                "instance": "Server",
                "Value #A": "RPS",
                "Value #B": "P95 Latency (s)",
                "Value #C": "Tokens/sec",
                "Value #D": "Queue"
              }
            }
          }
        ]
      },
      {
        "id": 10,
        "title": "Request Status Distribution",
        "type": "piechart",
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 28},
        "targets": [
          {
            "expr": "sum(rate(vllm_requests_total[5m])) by (status)",
            "legendFormat": "HTTP {{status}}",
            "refId": "A"
          }
        ],
        "options": {
          "reduceOptions": {"values": false, "calcs": ["lastNotNull"], "fields": ""},
          "pieType": "pie",
          "legend": {"displayMode": "table", "values": ["value", "percent"]}
        }
      },
      {
        "id": 11,
        "title": "Model Loading Time",
        "type": "timeseries",
        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 28},
        "targets": [
          {
            "expr": "vllm_model_load_duration_seconds",
            "legendFormat": "{{instance}} - {{model}}",
            "refId": "A"
          }
        ],
        "fieldConfig": {"defaults": {"unit": "s", "min": 0}}
      }
    ],
    "annotations": {
      "list": [
        {
          "name": "Inference Alerts",
          "enable": true,
          "iconColor": "rgba(255, 96, 96, 1)",
          "datasource": "Prometheus",
          "expr": "ALERTS{alertname=~\".*Inference.*|.*vLLM.*\"}"
        },
        {
          "name": "Deployments",
          "enable": true,
          "iconColor": "rgba(96, 255, 96, 1)",
          "datasource": "Prometheus",
          "expr": "increase(vllm_service_restarts_total[1h])"
        }
      ]
    },
    "templating": {
      "list": [
        {
          "name": "model",
          "type": "query",
          "datasource": "Prometheus",
          "query": "label_values(vllm_requests_total, model)",
          "multi": true,
          "includeAll": true
        },
        {
          "name": "instance",
          "type": "query",
          "datasource": "Prometheus",
          "query": "label_values(vllm_requests_total, instance)",
          "multi": true,
          "includeAll": true
        }
      ]
    }
  }
}
342
monitoring/prometheus/alerts.yml
Normal file
@ -0,0 +1,342 @@
# Prometheus alerting rules for AI Infrastructure
groups:
  # GPU-specific alerts
  - name: gpu.rules
    interval: 30s
    rules:
      - alert: GPUHighUtilization
        expr: nvidia_smi_utilization_gpu_ratio > 0.9
        for: 10m
        labels:
          severity: warning
          team: infrastructure
          component: gpu
        annotations:
          summary: "GPU utilization high on {{ $labels.instance }}"
          description: |
            GPU utilization has been above 90% for 10 minutes on {{ $labels.instance }}.
            Current utilization: {{ $value | humanizePercentage }}

            This may indicate:
            - High inference load requiring scale-up
            - Resource contention
            - Model optimization needed

            Consider scaling up if this persists.

      - alert: GPUMemoryHigh
        expr: nvidia_smi_memory_used_bytes / nvidia_smi_memory_total_bytes > 0.95
        for: 5m
        labels:
          severity: critical
          team: infrastructure
          component: gpu
        annotations:
          summary: "GPU memory usage critical on {{ $labels.instance }}"
          description: |
            GPU memory usage is critically high: {{ $value | humanizePercentage }} of total.

            Immediate action required:
            - Check for memory leaks
            - Reduce batch size
            - Consider model optimization

      - alert: GPUTemperatureHigh
        expr: nvidia_smi_temperature_gpu > 85
        for: 15m
        labels:
          severity: warning
          team: infrastructure
          component: gpu
        annotations:
          summary: "GPU temperature high on {{ $labels.instance }}"
          description: |
            GPU temperature is {{ $value }}°C (threshold: 85°C).

            Check the cooling system and reduce the workload if necessary.

      - alert: GPUDown
        expr: up{job="gex44-gpu"} == 0
        for: 2m
        labels:
          severity: critical
          team: infrastructure
          component: gpu
        annotations:
          summary: "GPU server {{ $labels.instance }} is down"
          description: |
            GPU metrics are not being collected from {{ $labels.instance }}.

            This could indicate:
            - Server is down
            - nvidia-smi-exporter is not running
            - Network connectivity issues

            Immediate investigation required.

  # vLLM inference alerts
  - name: inference.rules
    interval: 30s
    rules:
      - alert: HighInferenceLatency
        expr: histogram_quantile(0.95, rate(vllm_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
          team: ml-platform
          component: inference
        annotations:
          summary: "High inference latency detected"
          description: |
            95th percentile latency is {{ $value | printf "%.2f" }}s (threshold: 2s).

            This affects user experience and may indicate:
            - Model complexity issues
            - Resource constraints
            - Network bottlenecks

      - alert: InferenceErrorRate
        expr: rate(vllm_requests_total{status!="200"}[5m]) / rate(vllm_requests_total[5m]) > 0.05
        for: 2m
        labels:
          severity: critical
          team: ml-platform
          component: inference
        annotations:
          summary: "High error rate in inference API"
          description: |
            Error rate is {{ $value | humanizePercentage }} (threshold: 5%).

            Check application logs and model health immediately.

      - alert: vLLMServiceDown
        expr: up{job="vllm-api"} == 0
        for: 1m
        labels:
          severity: critical
          team: ml-platform
          component: inference
        annotations:
          summary: "vLLM service down on {{ $labels.instance }}"
          description: |
            The vLLM API is not responding on {{ $labels.instance }}.

            Service recovery steps:
            1. Check systemctl status vllm-api
            2. Check GPU availability
            3. Review service logs

      - alert: InferenceQueueBacklog
        expr: vllm_queue_size > 50
        for: 5m
        labels:
          severity: warning
          team: ml-platform
          component: inference
        annotations:
          summary: "Large inference queue on {{ $labels.instance }}"
          description: |
            Queue size: {{ $value }} requests (threshold: 50).

            Consider:
            - Scaling up GPU servers
            - Optimizing model parameters
            - Load balancing adjustments

  # Cost optimization alerts
  - name: cost.rules
    interval: 60s
    rules:
      - alert: UnusedGPUCost
        expr: avg_over_time(nvidia_smi_utilization_gpu_ratio[30m]) < 0.1
        for: 30m
        labels:
          severity: info
          team: finops
          component: cost-optimization
        annotations:
          summary: "Potentially unused GPU detected"
          description: |
            GPU {{ $labels.instance }} has been under 10% utilization for 30 minutes.

            Monthly cost impact: €184

            Consider:
            - Scheduling workloads more efficiently
            - Temporary shutdown during low usage
            - Rightsizing the infrastructure

      - alert: HighCostPerRequest
        expr: (184 * 3 / 30 / 24) / (sum(rate(vllm_requests_total{status="200"}[1h])) * 3600) > 0.01
        for: 15m
        labels:
          severity: warning
          team: finops
          component: cost-optimization
        annotations:
          summary: "High cost per request detected"
          description: |
            Current cost per request: €{{ $value | printf "%.4f" }}
            Target: <€0.01 per request

            Optimization needed:
            - Increase request volume
            - Optimize infrastructure usage
            - Review pricing model

  # Infrastructure health alerts
  - name: infrastructure.rules
    interval: 30s
    rules:
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 10m
        labels:
          severity: warning
          team: infrastructure
          component: compute
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: |
            CPU usage: {{ $value | printf "%.1f" }}%

            Monitor for performance impact on inference.

      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9
        for: 5m
        labels:
          severity: critical
          team: infrastructure
          component: memory
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: |
            Memory usage: {{ $value | humanizePercentage }} of total.

      - alert: DiskSpaceLow
        expr: (node_filesystem_size_bytes{fstype!="tmpfs"} - node_filesystem_free_bytes) / node_filesystem_size_bytes > 0.85
        for: 10m
        labels:
          severity: warning
          team: infrastructure
          component: storage
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"
          description: |
            Disk usage: {{ $value | humanizePercentage }} of the filesystem.

            Clean up logs or expand storage.

  # Load balancer alerts
  - name: loadbalancer.rules
    interval: 30s
    rules:
      - alert: LoadBalancerDown
        expr: up{job="haproxy"} == 0
        for: 1m
        labels:
          severity: critical
          team: infrastructure
          component: loadbalancer
        annotations:
          summary: "Load balancer is down"
          description: |
            HAProxy is not responding. All traffic is affected.

            Immediate action required!

      - alert: BackendServerDown
        expr: haproxy_server_up{backend="vllm_backend"} == 0
        for: 2m
        labels:
          severity: critical
          team: infrastructure
          component: loadbalancer
        annotations:
          summary: "Backend server {{ $labels.server }} is down"
          description: |
            Server {{ $labels.server }} in backend {{ $labels.backend }} is marked as down.

            Check server health and connectivity.

      - alert: HighResponseTime
        expr: haproxy_backend_response_time_average_seconds{backend="vllm_backend"} > 3
        for: 5m
        labels:
          severity: warning
          team: infrastructure
          component: loadbalancer
        annotations:
          summary: "High response time from backend"
          description: |
            Average response time: {{ $value | printf "%.2f" }}s

            Check backend server performance.

  # Network and connectivity alerts
  - name: network.rules
    interval: 30s
    rules:
      - alert: HighNetworkTraffic
        expr: rate(node_network_receive_bytes_total{device!="lo"}[5m]) > 100 * 1024 * 1024
        for: 10m
        labels:
          severity: info
          team: infrastructure
          component: network
        annotations:
          summary: "High network traffic on {{ $labels.instance }}"
          description: |
            Inbound traffic: {{ $value | humanize }}B/s

            Monitor for potential issues.

      - alert: ServiceUnreachable
        expr: probe_success{job="blackbox-http"} == 0
        for: 2m
        labels:
          severity: critical
          team: infrastructure
          component: connectivity
        annotations:
          summary: "Service {{ $labels.instance }} is unreachable"
          description: |
            HTTP probe failed for {{ $labels.instance }}.

            Check service status and network connectivity.

  # Security alerts
  - name: security.rules
    interval: 60s
    rules:
      - alert: SSLCertificateExpiringSoon
        expr: (probe_ssl_earliest_cert_expiry - time()) / 86400 < 7
        for: 1h
        labels:
          severity: warning
          team: security
          component: certificates
        annotations:
          summary: "SSL certificate expiring soon for {{ $labels.instance }}"
          description: |
            Certificate expires in {{ $value | printf "%.0f" }} days.

            Renew the certificate before expiration.

      - alert: UnauthorizedAPIAccess
        expr: increase(vllm_requests_total{status="401"}[5m]) > 10
        for: 1m
        labels:
          severity: warning
          team: security
          component: authentication
        annotations:
          summary: "Multiple unauthorized API access attempts"
          description: |
            {{ $value }} unauthorized requests in the last 5 minutes.

            Potential security issue - investigate the source.
172
monitoring/prometheus/prometheus.yml
Normal file
@ -0,0 +1,172 @@
# Prometheus configuration for AI Infrastructure monitoring
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'ai-infrastructure'
    environment: 'production'

# Rule files for alerting
rule_files:
  - "alerts.yml"
  - "recording_rules.yml"

# Scrape configurations
scrape_configs:
  # Prometheus self-monitoring
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
    scrape_interval: 30s

  # GEX44 GPU servers - GPU metrics
  - job_name: 'gex44-gpu'
    static_configs:
      - targets:
          - '10.0.1.10:9835'  # gex44-1 nvidia-smi-exporter
          - '10.0.1.11:9835'  # gex44-2 nvidia-smi-exporter
          - '10.0.1.12:9835'  # gex44-3 nvidia-smi-exporter
    scrape_interval: 5s
    scrape_timeout: 4s
    metrics_path: '/metrics'
    params:
      format: ['prometheus']

  # GEX44 GPU servers - System metrics
  - job_name: 'gex44-system'
    static_configs:
      - targets:
          - '10.0.1.10:9100'  # gex44-1 node-exporter
          - '10.0.1.11:9100'  # gex44-2 node-exporter
          - '10.0.1.12:9100'  # gex44-3 node-exporter
    scrape_interval: 15s

  # vLLM API metrics
  - job_name: 'vllm-api'
    static_configs:
      - targets:
          - '10.0.1.10:8000'  # gex44-1 vLLM API
          - '10.0.1.11:8000'  # gex44-2 vLLM API
          - '10.0.1.12:8000'  # gex44-3 vLLM API
    metrics_path: '/metrics'
    scrape_interval: 10s
    scrape_timeout: 8s

  # vLLM custom metrics exporter
  - job_name: 'vllm-metrics'
    static_configs:
      - targets:
          - '10.0.1.10:9000'  # gex44-1 vLLM metrics
          - '10.0.1.11:9000'  # gex44-2 vLLM metrics
          - '10.0.1.12:9000'  # gex44-3 vLLM metrics
    scrape_interval: 5s

  # HAProxy load balancer
  - job_name: 'haproxy'
    static_configs:
      - targets: ['10.0.2.10:8404']
    metrics_path: '/stats/prometheus'
    scrape_interval: 10s

  # Cloud servers - System metrics
  - job_name: 'cloud-servers'
    static_configs:
      - targets:
          - '10.0.2.10:9100'  # load-balancer node-exporter
          - '10.0.2.11:9100'  # api-gateway node-exporter
          - '10.0.2.12:9100'  # monitoring node-exporter
    scrape_interval: 15s

  # API Gateway (nginx)
  - job_name: 'api-gateway'
    static_configs:
      - targets: ['10.0.2.11:9113']  # nginx-prometheus-exporter
    scrape_interval: 15s

  # Custom business metrics
  - job_name: 'business-metrics'
    static_configs:
      - targets:
          - '10.0.2.10:9001'  # cost-tracker
          - '10.0.2.11:9002'  # api-analytics
    scrape_interval: 30s

  # Docker containers (if used)
  - job_name: 'docker'
    static_configs:
      - targets:
          - '10.0.1.10:9323'  # gex44-1 docker metrics
          - '10.0.1.11:9323'  # gex44-2 docker metrics
          - '10.0.1.12:9323'  # gex44-3 docker metrics
    scrape_interval: 30s

  # Blackbox monitoring for external endpoints
  - job_name: 'blackbox-http'
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
      - targets:
          - http://10.0.2.10/health       # Load balancer health
          - http://10.0.1.10:8000/health  # gex44-1 vLLM health
          - http://10.0.1.11:8000/health  # gex44-2 vLLM health
          - http://10.0.1.12:8000/health  # gex44-3 vLLM health
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 10.0.2.12:9115  # blackbox exporter address

  # SSL certificate monitoring
  - job_name: 'ssl-certificates'
    metrics_path: /probe
    params:
      module: [tls_connect]
    static_configs:
      - targets:
          - api.yourdomain.com:443
          - monitoring.yourdomain.com:443
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 10.0.2.12:9115

# AlertManager configuration
alerting:
  alertmanagers:
    - path_prefix: /
      static_configs:
        - targets:
            - "alertmanager:9093"

# Remote write configuration (for long-term storage)
remote_write:
  - url: "http://victoriametrics:8428/api/v1/write"
    queue_config:
      max_samples_per_send: 10000
      batch_send_deadline: 5s
      max_shards: 200
    write_relabel_configs:
      # Keep only essential metrics for long-term storage
      - source_labels: [__name__]
        regex: '(nvidia_smi_.*|vllm_.*|haproxy_.*|up|node_.*cpu.*|node_.*memory.*|node_disk_.*)'
        action: keep
# NOTE: storage retention, the TSDB path, and query limits are not valid
# prometheus.yml keys; Prometheus only accepts them as command-line flags.
# Pass the equivalents when starting the server:
#   --storage.tsdb.retention.time=30d
#   --storage.tsdb.retention.size=50GB
#   --storage.tsdb.path=/prometheus/data
#   --storage.tsdb.wal-compression
#   --query.max-concurrency=20
#   --query.timeout=2m
#   --query.max-samples=50000000
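For reference, a minimal sketch of passing those flags when Prometheus runs under
Docker (the v2.47.2 tag matches the version pinned elsewhere in this commit; the
host paths are assumptions, not part of this repo):

  docker run -d --name prometheus \
    -p 9090:9090 \
    -v /etc/prometheus:/etc/prometheus \
    -v /prometheus/data:/prometheus/data \
    prom/prometheus:v2.47.2 \
    --config.file=/etc/prometheus/prometheus.yml \
    --storage.tsdb.path=/prometheus/data \
    --storage.tsdb.retention.time=30d \
    --storage.tsdb.retention.size=50GB \
    --storage.tsdb.wal-compression \
    --query.max-concurrency=20 \
    --query.timeout=2m \
    --query.max-samples=50000000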
447
scripts/cost-analysis.py
Normal file
@ -0,0 +1,447 @@
#!/usr/bin/env python3
"""
Cost Analysis Script for AI Infrastructure
Provides detailed cost breakdown and optimization recommendations.
"""

import argparse
import json
import os
import sys
from datetime import datetime
from dataclasses import dataclass, asdict
from typing import Dict, List, Optional

import requests

# Timeout for all HTTP calls so a hung endpoint cannot stall the report
REQUEST_TIMEOUT = 10


@dataclass
class CostBreakdown:
    """Cost breakdown structure (all values are EUR per month)"""
    hetzner_servers: float
    hetzner_cloud: float
    bandwidth: float
    storage: float
    tools_and_licenses: float
    operational_time: float

    @property
    def total_monthly(self) -> float:
        return (self.hetzner_servers + self.hetzner_cloud +
                self.bandwidth + self.storage +
                self.tools_and_licenses + self.operational_time)


class CostAnalyzer:
    """Main cost analysis class"""

    def __init__(self, environment: str = "production"):
        self.environment = environment
        self.hcloud_token = os.getenv('HCLOUD_TOKEN')
        self.prometheus_url = os.getenv('PROMETHEUS_URL', 'http://localhost:9090')

        # Current pricing (EUR)
        self.pricing = {
            'gex44_monthly': 184.00,
            'cx31_monthly': 22.68,
            'cx21_monthly': 11.76,
            'cx11_monthly': 4.90,
            'storage_gb_monthly': 0.05,
            'backup_gb_monthly': 0.012,
            'bandwidth_gb': 0.00,  # Free in Germany
            'gitlab_premium_monthly': 29.00,
            'devops_hourly': 50.00
        }

    def get_infrastructure_costs(self) -> CostBreakdown:
        """Calculate current infrastructure costs"""

        # Get server counts from the Hetzner API or fall back to defaults
        server_counts = self._get_server_counts()

        # Calculate costs
        hetzner_servers = server_counts['gex44'] * self.pricing['gex44_monthly']

        hetzner_cloud = (
            server_counts['cx31'] * self.pricing['cx31_monthly'] +
            server_counts['cx21'] * self.pricing['cx21_monthly'] +
            server_counts['cx11'] * self.pricing['cx11_monthly']
        )

        storage = server_counts['storage_gb'] * self.pricing['storage_gb_monthly']
        bandwidth = 0  # Free within Germany
        tools_and_licenses = self.pricing['gitlab_premium_monthly']

        # Operational time (10 hours/week maintenance, ~4 weeks per month)
        operational_time = 10 * 4 * self.pricing['devops_hourly']

        return CostBreakdown(
            hetzner_servers=hetzner_servers,
            hetzner_cloud=hetzner_cloud,
            bandwidth=bandwidth,
            storage=storage,
            tools_and_licenses=tools_and_licenses,
            operational_time=operational_time
        )

    def _get_server_counts(self) -> Dict[str, int]:
        """Get current server counts from various sources"""
        counts = {
            'gex44': 3,  # Default
            'cx31': 2,   # LB + API Gateway
            'cx21': 1,   # Monitoring
            'cx11': 0,
            'storage_gb': 500
        }

        # Try to get actual counts from Hetzner API
        if self.hcloud_token:
            try:
                counts.update(self._get_hcloud_server_counts())
            except Exception as e:
                print(f"Warning: Could not fetch Hetzner Cloud data: {e}")

        # Try to get GEX44 count from Prometheus
        try:
            gex44_count = self._get_prometheus_server_count()
            if gex44_count:
                counts['gex44'] = gex44_count
        except Exception as e:
            print(f"Warning: Could not fetch Prometheus data: {e}")

        return counts

    def _get_hcloud_server_counts(self) -> Dict[str, int]:
        """Get server counts from Hetzner Cloud API"""
        headers = {'Authorization': f'Bearer {self.hcloud_token}'}
        response = requests.get('https://api.hetzner.cloud/v1/servers',
                                headers=headers, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()

        servers = response.json()['servers']
        counts = {'cx31': 0, 'cx21': 0, 'cx11': 0}
        storage_gb = 0

        for server in servers:
            if server['status'] == 'running':
                server_type = server['server_type']['name']
                if server_type in counts:
                    counts[server_type] += 1

        # Get volumes
        response = requests.get('https://api.hetzner.cloud/v1/volumes',
                                headers=headers, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()

        volumes = response.json()['volumes']
        for volume in volumes:
            storage_gb += volume['size']

        counts['storage_gb'] = storage_gb
        return counts

    def _get_prometheus_server_count(self) -> Optional[int]:
        """Get GEX44 server count from Prometheus"""
        query = 'count(up{job="gex44-gpu"})'
        response = requests.get(
            f'{self.prometheus_url}/api/v1/query',
            params={'query': query},
            timeout=REQUEST_TIMEOUT
        )

        if response.status_code == 200:
            data = response.json()
            if data['data']['result']:
                return int(data['data']['result'][0]['value'][1])

        return None

    def get_usage_metrics(self) -> Dict[str, float]:
        """Get infrastructure usage metrics from Prometheus.

        All utilization metrics are returned as 0.0-1.0 ratios so the
        thresholds and percentage formatting below stay consistent.
        """
        metrics = {}

        queries = {
            'avg_gpu_utilization': 'avg(nvidia_smi_utilization_gpu_ratio)',
            'avg_cpu_utilization': 'avg(1 - avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])))',
            'avg_memory_utilization': 'avg((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes)',
            'requests_per_hour': 'sum(rate(vllm_requests_total[1h])) * 3600',
            'tokens_per_hour': 'sum(rate(vllm_tokens_generated_total[1h])) * 3600'
        }

        for metric_name, query in queries.items():
            try:
                response = requests.get(
                    f'{self.prometheus_url}/api/v1/query',
                    params={'query': query},
                    timeout=REQUEST_TIMEOUT
                )

                if response.status_code == 200:
                    data = response.json()
                    if data['data']['result']:
                        metrics[metric_name] = float(data['data']['result'][0]['value'][1])
                    else:
                        metrics[metric_name] = 0.0

            except Exception as e:
                print(f"Warning: Could not fetch {metric_name}: {e}")
                metrics[metric_name] = 0.0

        return metrics

    def calculate_cost_per_request(self, monthly_cost: float, requests_per_hour: float) -> float:
        """Calculate cost per request"""
        if requests_per_hour == 0:
            return 0.0

        monthly_requests = requests_per_hour * 24 * 30
        return monthly_cost / monthly_requests

    def calculate_efficiency_score(self, metrics: Dict[str, float]) -> float:
        """Calculate overall efficiency score (0-100) from 0.0-1.0 utilization ratios"""
        gpu_efficiency = metrics.get('avg_gpu_utilization', 0) * 100
        cpu_efficiency = min(metrics.get('avg_cpu_utilization', 0), 0.8) / 0.8 * 100        # Cap at 80%
        memory_efficiency = min(metrics.get('avg_memory_utilization', 0), 0.85) / 0.85 * 100  # Cap at 85%

        # Weighted average
        return gpu_efficiency * 0.5 + cpu_efficiency * 0.3 + memory_efficiency * 0.2

    def get_optimization_recommendations(self, costs: CostBreakdown, metrics: Dict[str, float]) -> List[str]:
        """Generate cost optimization recommendations"""
        recommendations = []

        efficiency_score = self.calculate_efficiency_score(metrics)
        gpu_utilization = metrics.get('avg_gpu_utilization', 0)

        # GPU utilization recommendations
        if gpu_utilization < 0.3:
            savings = costs.hetzner_servers * 0.33  # Roughly one of three servers
            recommendations.append(
                f"LOW GPU UTILIZATION ({gpu_utilization:.1%}): Consider reducing GPU servers by 1. "
                f"Potential savings: €{savings:.2f}/month"
            )
        elif gpu_utilization > 0.8:
            cost_increase = self.pricing['gex44_monthly']
            recommendations.append(
                f"HIGH GPU UTILIZATION ({gpu_utilization:.1%}): Consider adding 1 more GPU server. "
                f"Additional cost: €{cost_increase:.2f}/month"
            )

        # Cloud server optimization
        if metrics.get('avg_cpu_utilization', 0) < 0.3:
            recommendations.append(
                "LOW CPU UTILIZATION: Consider downgrading cloud server types (cx31 → cx21)"
            )

        # Storage optimization
        if costs.storage > 50:  # More than €50/month on storage
            recommendations.append(
                "HIGH STORAGE COSTS: Review storage usage and implement automated cleanup"
            )

        # Operational efficiency
        if efficiency_score < 60:
            recommendations.append(
                f"LOW EFFICIENCY SCORE ({efficiency_score:.1f}/100): "
                "Review resource allocation and workload distribution"
            )

        # Request efficiency
        cost_per_request = self.calculate_cost_per_request(
            costs.total_monthly,
            metrics.get('requests_per_hour', 0)
        )

        if cost_per_request > 0.005:  # More than €0.005 per request
            recommendations.append(
                f"HIGH COST PER REQUEST (€{cost_per_request:.4f}): "
                "Optimize request batching or increase utilization"
            )

        return recommendations

    def compare_alternatives(self, costs: CostBreakdown) -> Dict[str, Dict]:
        """Compare costs with cloud alternatives"""

        # AWS equivalent (A100 40GB instance; p4d-class hardware is only sold
        # in larger sizes, so this is an approximate per-GPU on-demand rate)
        aws_gpu_hourly = 4.50  # USD, converted to EUR below (~0.85 rate)
        aws_monthly = aws_gpu_hourly * 24 * 30 * 0.85 * 3  # 3 instances
        aws_cloud_services = 850 * 0.85  # Support services
        aws_total = aws_monthly + aws_cloud_services

        # Azure equivalent (NC24ads A100 v4)
        azure_gpu_hourly = 3.67  # USD
        azure_monthly = azure_gpu_hourly * 24 * 30 * 0.85 * 3
        azure_cloud_services = 780 * 0.85
        azure_total = azure_monthly + azure_cloud_services

        return {
            'hetzner': {
                'monthly_cost': costs.total_monthly,
                'cost_per_gpu': costs.hetzner_servers / 3,
                'performance_ratio': 1.0  # Baseline
            },
            'aws': {
                'monthly_cost': aws_total,
                'cost_per_gpu': aws_monthly / 3,
                'performance_ratio': 1.4,  # A100 ~40% faster than RTX 4000 Ada
                'cost_efficiency': costs.total_monthly / (aws_total / 1.4)
            },
            'azure': {
                'monthly_cost': azure_total,
                'cost_per_gpu': azure_monthly / 3,
                'performance_ratio': 1.4,
                'cost_efficiency': costs.total_monthly / (azure_total / 1.4)
            }
        }

    def generate_report(self, format_type: str = "markdown") -> str:
        """Generate comprehensive cost analysis report"""
        costs = self.get_infrastructure_costs()
        metrics = self.get_usage_metrics()
        recommendations = self.get_optimization_recommendations(costs, metrics)
        alternatives = self.compare_alternatives(costs)

        if format_type == "json":
            return json.dumps({
                'timestamp': datetime.now().isoformat(),
                'environment': self.environment,
                'costs': asdict(costs),
                'metrics': metrics,
                'recommendations': recommendations,
                'alternatives': alternatives,
                'efficiency_score': self.calculate_efficiency_score(metrics)
            }, indent=2)

        elif format_type == "markdown":
            return self._generate_markdown_report(costs, metrics, recommendations, alternatives)

        else:
            raise ValueError(f"Unsupported format: {format_type}")

    def _generate_markdown_report(self, costs: CostBreakdown, metrics: Dict[str, float],
                                  recommendations: List[str], alternatives: Dict[str, Dict]) -> str:
        """Generate markdown report"""

        efficiency_score = self.calculate_efficiency_score(metrics)
        cost_per_request = self.calculate_cost_per_request(
            costs.total_monthly,
            metrics.get('requests_per_hour', 0)
        )

        report = f"""# Cost Analysis Report - {self.environment.title()}
*Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*

## Executive Summary

| Metric | Value |
|--------|-------|
| **Total Monthly Cost** | €{costs.total_monthly:.2f} |
| **Cost per Request** | €{cost_per_request:.4f} |
| **Efficiency Score** | {efficiency_score:.1f}/100 |
| **GPU Utilization** | {metrics.get('avg_gpu_utilization', 0):.1%} |

## Cost Breakdown

| Component | Monthly Cost | Percentage |
|-----------|--------------|------------|
| GPU Servers (GEX44) | €{costs.hetzner_servers:.2f} | {costs.hetzner_servers/costs.total_monthly*100:.1f}% |
| Cloud Servers | €{costs.hetzner_cloud:.2f} | {costs.hetzner_cloud/costs.total_monthly*100:.1f}% |
| Storage | €{costs.storage:.2f} | {costs.storage/costs.total_monthly*100:.1f}% |
| Tools & Licenses | €{costs.tools_and_licenses:.2f} | {costs.tools_and_licenses/costs.total_monthly*100:.1f}% |
| Operational Time | €{costs.operational_time:.2f} | {costs.operational_time/costs.total_monthly*100:.1f}% |
| **Total** | **€{costs.total_monthly:.2f}** | **100%** |

## Performance Metrics

| Metric | Current Value |
|--------|---------------|
| Average GPU Utilization | {metrics.get('avg_gpu_utilization', 0):.1%} |
| Average CPU Utilization | {metrics.get('avg_cpu_utilization', 0):.1%} |
| Average Memory Utilization | {metrics.get('avg_memory_utilization', 0):.1%} |
| Requests per Hour | {metrics.get('requests_per_hour', 0):.0f} |
| Tokens per Hour | {metrics.get('tokens_per_hour', 0):.0f} |

## Cloud Provider Comparison

| Provider | Monthly Cost | Cost vs Hetzner | Performance Ratio | Cost Efficiency |
|----------|--------------|-----------------|-------------------|-----------------|
| **Hetzner** | €{alternatives['hetzner']['monthly_cost']:.2f} | Baseline | 1.0x | 1.0x |
| AWS | €{alternatives['aws']['monthly_cost']:.2f} | +{(alternatives['aws']['monthly_cost']/alternatives['hetzner']['monthly_cost']-1)*100:.0f}% | {alternatives['aws']['performance_ratio']:.1f}x | {alternatives['aws']['cost_efficiency']:.1f}x |
| Azure | €{alternatives['azure']['monthly_cost']:.2f} | +{(alternatives['azure']['monthly_cost']/alternatives['hetzner']['monthly_cost']-1)*100:.0f}% | {alternatives['azure']['performance_ratio']:.1f}x | {alternatives['azure']['cost_efficiency']:.1f}x |

## Optimization Recommendations

"""

        if recommendations:
            for i, rec in enumerate(recommendations, 1):
                report += f"{i}. {rec}\n"
        else:
            report += "✅ No immediate optimization opportunities identified.\n"

        report += """
## Cost Trends

*Note: Implement trend tracking by running this report regularly*

## Action Items

### Immediate (This Week)
- Review GPU utilization patterns
- Implement automated scaling policies
- Optimize model loading and caching

### Short Term (This Month)
- Analyze usage patterns for better capacity planning
- Implement cost alerting thresholds
- Review and optimize storage usage

### Long Term (Next Quarter)
- Evaluate upgrade path to newer hardware
- Consider multi-region deployment for optimization
- Implement advanced cost allocation tracking

## Contact

For questions about this cost analysis, contact the Infrastructure Team.

---
*Report generated by AI Infrastructure Cost Analyzer v1.0*
"""

        return report


def main():
    parser = argparse.ArgumentParser(description='AI Infrastructure Cost Analysis')
    parser.add_argument('--environment', '-e', default='production',
                        help='Environment to analyze (default: production)')
    parser.add_argument('--format', '-f', choices=['markdown', 'json'], default='markdown',
                        help='Output format (default: markdown)')
    parser.add_argument('--output', '-o', help='Output file (default: stdout)')
    parser.add_argument('--find-unused', action='store_true',
                        help='Find unused resources for cleanup')

    args = parser.parse_args()

    try:
        analyzer = CostAnalyzer(args.environment)

        if args.find_unused:
            # Special mode to find unused resources (not implemented yet)
            print("Scanning for unused resources...")
            print("Unused-resource scanning is not implemented yet.")
            sys.exit(0)

        report = analyzer.generate_report(args.format)

        if args.output:
            with open(args.output, 'w') as f:
                f.write(report)
            print(f"Report written to {args.output}")
        else:
            print(report)

    except Exception as e:
        print(f"Error generating cost analysis: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
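Usage sketch for the script above (HCLOUD_TOKEN and PROMETHEUS_URL are optional;
without them the script falls back to the built-in default counts and
http://localhost:9090):

  export PROMETHEUS_URL=http://10.0.2.12:9090   # monitoring host, per prometheus.yml
  ./scripts/cost-analysis.py --environment production
  ./scripts/cost-analysis.py --format json --output cost-report.json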
98
terraform/main.tf
Normal file
@ -0,0 +1,98 @@
# Main Terraform configuration for AI Infrastructure
terraform {
  required_version = ">= 1.5"
  required_providers {
    hcloud = {
      source  = "hetznercloud/hcloud"
      version = "~> 1.45"
    }
    random = {
      source  = "hashicorp/random"
      version = "~> 3.1"
    }
  }
}

# Provider configuration
provider "hcloud" {
  token = var.hcloud_token
}

# Data sources
data "hcloud_ssh_key" "main" {
  name = var.ssh_key_name
}

# Base infrastructure
module "hcloud_base" {
  source = "./modules/hcloud-base"

  environment          = var.environment
  ssh_public_key       = var.ssh_public_key
  ssh_key_name         = var.ssh_key_name
  network_zone         = var.network_zone
  private_network_cidr = var.private_network_cidr
  gex44_subnet         = var.gex44_subnet
  cloud_subnet         = var.cloud_subnet
  allowed_ssh_cidrs    = var.allowed_ssh_cidrs
}

# Load balancer
module "load_balancer" {
  source = "./modules/load-balancer"

  environment  = var.environment
  network_id   = module.hcloud_base.network_id
  ssh_key_name = module.hcloud_base.ssh_key_name
  subnet_id    = module.hcloud_base.cloud_subnet_id

  gex44_ips = [
    "10.0.1.10", # GEX44-1
    "10.0.1.11", # GEX44-2
    "10.0.1.12"  # GEX44-3
  ]

  depends_on = [module.hcloud_base]
}

# API Gateway
module "api_gateway" {
  source = "./modules/api-gateway"

  environment  = var.environment
  network_id   = module.hcloud_base.network_id
  ssh_key_name = module.hcloud_base.ssh_key_name
  subnet_id    = module.hcloud_base.cloud_subnet_id
  lb_ip        = module.load_balancer.private_ip

  depends_on = [module.hcloud_base, module.load_balancer]
}

# Monitoring stack
module "monitoring" {
  source = "./modules/monitoring"

  environment            = var.environment
  network_id             = module.hcloud_base.network_id
  ssh_key_name           = module.hcloud_base.ssh_key_name
  subnet_id              = module.hcloud_base.cloud_subnet_id
  retention_days         = var.monitoring_retention_days
  grafana_admin_password = var.grafana_admin_password

  depends_on = [module.hcloud_base]
}

# GEX44 configuration helpers
module "gex44_config" {
  source = "./modules/gex44-config"

  environment      = var.environment
  gex44_count      = var.gex44_count
  network_id       = module.hcloud_base.network_id
  ssh_key_name     = module.hcloud_base.ssh_key_name
  ansible_repo_url = var.ansible_repo_url
  gitlab_token     = var.gitlab_deploy_token
  vault_password   = var.vault_password

  depends_on = [module.hcloud_base]
}
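A minimal apply sketch for the root module above (variable values such as
hcloud_token and ssh_public_key are assumed to live in a terraform.tfvars file,
which this commit does not include):

  cd terraform
  terraform init
  terraform plan -out=tfplan
  terraform apply tfplan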
164
terraform/modules/ansible-inventory/main.tf
Normal file
@ -0,0 +1,164 @@
# terraform/modules/ansible-inventory/main.tf
# Generate Ansible inventory directly from Terraform

locals {
  # Load environment requirements
  requirements = yamldecode(file("${path.root}/../../inventories/${var.environment}/requirements.yml"))

  # Generate inventory structure
  inventory = {
    all = {
      vars = {
        environment                  = var.environment
        os_family                    = "ubuntu"
        os_version                   = "24.04"
        ansible_user                 = "ubuntu"
        python_interpreter           = "/usr/bin/python3"
        ansible_ssh_private_key_file = "~/.ssh/hetzner-${var.environment}"
      }
      children = merge(
        var.environment == "development" ? {
          dev_servers = {
            hosts = var.dev_servers != null ? {
              for server in var.dev_servers : server.name => {
                ansible_host = server.ipv4_address
                private_ip   = server.private_ip
                cpu_only     = true
                vllm_port    = 8000
                os_image     = "ubuntu-24.04"
              }
            } : {}
            vars = {
              docker_version = "24.0.*"
              vllm_version   = "latest"
              model_config   = local.requirements.models
              gpu_simulation = true
              ubuntu_version = "24.04"
            }
          }
        } : {},

        length(var.gex44_servers) > 0 ? {
          # Computed map keys must be quoted (or parenthesized) in HCL
          "gex44_${var.environment}" = {
            hosts = {
              for i, server in var.gex44_servers : server.name => {
                ansible_host         = server.ipv4_address
                private_ip           = server.private_ip
                gpu_type             = try(local.requirements.infrastructure.specifications[i].gpu, "RTX_4000_Ada_20GB")
                cpu_type             = try(local.requirements.infrastructure.specifications[i].cpu, "Intel_i5_13500")
                ram_gb               = try(local.requirements.infrastructure.specifications[i].ram, 64)
                nvme_config          = try(local.requirements.infrastructure.specifications[i].nvme, "2x1TB")
                vllm_port            = 8000
                metrics_port         = 9400
                cuda_visible_devices = "0"
                os_image             = "ubuntu-24.04"
              }
            }
            vars = {
              nvidia_driver_version = "545.23.08"
              docker_version        = "24.0.*"
              vllm_version          = "latest"
              model_config          = local.requirements.models
              scaling_config        = local.requirements.scaling
              ubuntu_version        = "24.04"
            }
          }
        } : {},

        var.load_balancers != null ? {
          load_balancer = {
            hosts = {
              for i, lb in var.load_balancers : lb.name => {
                ansible_host     = lb.ipv4_address
                private_ip       = lb.private_ip
                role             = i == 0 ? "primary" : "backup"
                haproxy_priority = 100 - (i * 10)
              }
            }
            vars = {
              haproxy_backend_servers = [for server in var.gex44_servers : server.private_ip]
              ssl_certificate_type    = try(local.requirements.security.ssl_certificate, "letsencrypt")
              environment_config      = local.requirements
            }
          }
        } : {},

        var.monitoring_server != null ? {
          monitoring = {
            hosts = {
              "monitoring-${var.environment}" = {
                ansible_host         = var.monitoring_server.ipv4_address
                private_ip           = var.monitoring_server.private_ip
                prometheus_retention = try(local.requirements.integrations.monitoring.prometheus_retention, "30d")
                alert_severity       = try(local.requirements.integrations.monitoring.alert_severity, "warning")
                os_image             = "ubuntu-24.04"
              }
            }
            vars = {
              prometheus_version   = "2.47.2"
              grafana_version      = "10.2.0"
              alertmanager_version = "0.26.0"
              ubuntu_version       = "24.04"
            }
          }
        } : {}
      )
    }
  }
}

# Generate YAML inventory file.
# Note: depends_on cannot reference input variables, so the original
# depends_on = [var.servers_ready] is invalid. To sequence this after server
# provisioning, set depends_on on the module block in the caller (supported
# since Terraform 0.13); var.servers_ready remains as a documented hook.
resource "local_file" "ansible_inventory" {
  content  = yamlencode(local.inventory)
  filename = "${path.root}/../../inventories/${var.environment}/hosts.yml"
}

# Generate SSH config
resource "local_file" "ssh_config" {
  content = templatefile("${path.module}/ssh_config.tftpl", {
    environment = var.environment
    hosts = merge(
      var.dev_servers != null ? {
        for server in var.dev_servers : server.name => {
          ip    = server.ipv4_address
          group = "dev_servers"
        }
      } : {},
      {
        for server in var.gex44_servers : server.name => {
          ip    = server.ipv4_address
          group = "gex44_${var.environment}"
        }
      },
      var.load_balancers != null ? {
        for lb in var.load_balancers : lb.name => {
          ip    = lb.ipv4_address
          group = "load_balancer"
        }
      } : {},
      var.monitoring_server != null ? {
        "monitoring-${var.environment}" = {
          ip    = var.monitoring_server.ipv4_address
          group = "monitoring"
        }
      } : {}
    )
  })
  filename = "${path.root}/../../inventories/${var.environment}/ssh_config"
}

# Generate Ansible group_vars
resource "local_file" "group_vars" {
  for_each = local.inventory.all.children

  content  = yamlencode(each.value.vars)
  filename = "${path.root}/../../ansible/group_vars/${each.key}.yml"
}

# Output inventory for verification
output "inventory_preview" {
  value       = local.inventory
  description = "Generated Ansible inventory structure"
}
15
terraform/modules/ansible-inventory/ssh_config.tftpl
Normal file
@ -0,0 +1,15 @@
# SSH Config for ${environment} environment
# Generated automatically by Terraform - do not edit manually

%{ for host_name, host_data in hosts ~}
Host ${host_name}
    HostName ${host_data.ip}
    User ubuntu
    IdentityFile ~/.ssh/hetzner-${environment}
    StrictHostKeyChecking no
    UserKnownHostsFile /dev/null
    # Environment: ${environment}
    # Group: ${host_data.group}
    # OS: Ubuntu 24.04

%{ endfor ~}
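For illustration, one rendered stanza of this template (host name and address
are hypothetical; environment=production):

  Host gex44-1
      HostName 203.0.113.10
      User ubuntu
      IdentityFile ~/.ssh/hetzner-production
      StrictHostKeyChecking no
      UserKnownHostsFile /dev/null
      # Environment: production
      # Group: gex44_production
      # OS: Ubuntu 24.04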
52
terraform/modules/ansible-inventory/variables.tf
Normal file
@ -0,0 +1,52 @@
# terraform/modules/ansible-inventory/variables.tf

variable "environment" {
  description = "Environment name (development, staging, production)"
  type        = string
}

variable "gex44_servers" {
  description = "List of GEX44 servers from dedicated server provisioning"
  type = list(object({
    name         = string
    ipv4_address = string
    private_ip   = string
  }))
  default = []
}

variable "dev_servers" {
  description = "List of development servers (CPU-only)"
  type = list(object({
    name         = string
    ipv4_address = string
    private_ip   = string
  }))
  default = null
}

variable "load_balancers" {
  description = "List of load balancer servers"
  type = list(object({
    name         = string
    ipv4_address = string
    private_ip   = string
  }))
  default = null
}

variable "monitoring_server" {
  description = "Monitoring server details"
  type = object({
    name         = string
    ipv4_address = string
    private_ip   = string
  })
  default = null
}

variable "servers_ready" {
  description = "Dependency to ensure servers are provisioned before inventory generation"
  type        = any
  default     = null
}
270
terraform/modules/hcloud-base/main.tf
Normal file
@ -0,0 +1,270 @@
# Base Hetzner Cloud infrastructure module

# SSH Key management
resource "hcloud_ssh_key" "main" {
  count      = var.ssh_key_name != null ? 1 : 0
  name       = var.ssh_key_name
  public_key = var.ssh_public_key

  labels = {
    environment = var.environment
    managed_by  = "terraform"
    project     = "ai-infrastructure"
  }
}

data "hcloud_ssh_key" "existing" {
  count = var.ssh_key_name != null ? 0 : 1
  name  = "default"
}

locals {
  ssh_key_id   = var.ssh_key_name != null ? hcloud_ssh_key.main[0].id : data.hcloud_ssh_key.existing[0].id
  ssh_key_name = var.ssh_key_name != null ? hcloud_ssh_key.main[0].name : data.hcloud_ssh_key.existing[0].name
}

# Private network for all infrastructure
resource "hcloud_network" "main" {
  name     = "${var.environment}-ai-network"
  ip_range = var.private_network_cidr

  labels = {
    environment = var.environment
    managed_by  = "terraform"
    project     = "ai-infrastructure"
  }
}

# Subnet for GEX44 dedicated servers
resource "hcloud_network_subnet" "gex44" {
  network_id   = hcloud_network.main.id
  type         = "cloud"
  network_zone = var.network_zone
  ip_range     = var.gex44_subnet
}

# Subnet for cloud servers
resource "hcloud_network_subnet" "cloud" {
  network_id   = hcloud_network.main.id
  type         = "cloud"
  network_zone = var.network_zone
  ip_range     = var.cloud_subnet
}

# Firewall for SSH access
resource "hcloud_firewall" "ssh" {
  name = "${var.environment}-ssh-firewall"

  dynamic "rule" {
    for_each = var.allowed_ssh_cidrs
    content {
      direction   = "in"
      port        = "22"
      protocol    = "tcp"
      source_ips  = [rule.value]
      description = "SSH access from ${rule.value}"
    }
  }

  labels = {
    environment = var.environment
    managed_by  = "terraform"
    project     = "ai-infrastructure"
    type        = "ssh"
  }
}

# Firewall for HTTP/HTTPS access
resource "hcloud_firewall" "web" {
  name = "${var.environment}-web-firewall"

  rule {
    direction   = "in"
    port        = "80"
    protocol    = "tcp"
    source_ips  = ["0.0.0.0/0", "::/0"]
    description = "HTTP access"
  }

  rule {
    direction   = "in"
    port        = "443"
    protocol    = "tcp"
    source_ips  = ["0.0.0.0/0", "::/0"]
    description = "HTTPS access"
  }

  rule {
    direction   = "in"
    port        = "8000"
    protocol    = "tcp"
    source_ips  = ["0.0.0.0/0", "::/0"]
    description = "API access"
  }

  labels = {
    environment = var.environment
    managed_by  = "terraform"
    project     = "ai-infrastructure"
    type        = "web"
  }
}

# Firewall for monitoring
resource "hcloud_firewall" "monitoring" {
  name = "${var.environment}-monitoring-firewall"

  rule {
    direction   = "in"
    port        = "3000"
    protocol    = "tcp"
    source_ips  = var.allowed_ssh_cidrs
    description = "Grafana access"
  }

  rule {
    direction   = "in"
    port        = "9090"
    protocol    = "tcp"
    source_ips  = var.allowed_ssh_cidrs
    description = "Prometheus access"
  }

  rule {
    direction   = "in"
    port        = "9100"
    protocol    = "tcp"
    source_ips  = [var.private_network_cidr]
    description = "Node exporter access from private network"
  }

  rule {
    direction   = "in"
    port        = "9835"
    protocol    = "tcp"
    source_ips  = [var.private_network_cidr]
    description = "nvidia-smi exporter access from private network"
  }

  labels = {
    environment = var.environment
    managed_by  = "terraform"
    project     = "ai-infrastructure"
    type        = "monitoring"
  }
}

# Firewall for internal communication
resource "hcloud_firewall" "internal" {
  name = "${var.environment}-internal-firewall"

  rule {
    direction   = "in"
    port        = "any"
    protocol    = "tcp"
    source_ips  = [var.private_network_cidr]
    description = "Internal TCP traffic"
  }

  rule {
    direction   = "in"
    port        = "any"
    protocol    = "udp"
    source_ips  = [var.private_network_cidr]
    description = "Internal UDP traffic"
  }

  # ICMP rules must not set a port; the hcloud provider rejects one
  rule {
    direction   = "in"
    protocol    = "icmp"
    source_ips  = [var.private_network_cidr]
    description = "Internal ICMP traffic"
  }

  labels = {
    environment = var.environment
    managed_by  = "terraform"
    project     = "ai-infrastructure"
    type        = "internal"
  }
}

# Placement group for better performance and availability
resource "hcloud_placement_group" "main" {
  name = "${var.environment}-ai-placement-group"
  type = "spread"
  labels = {
    environment = var.environment
    managed_by  = "terraform"
    project     = "ai-infrastructure"
  }
}

# Volume for shared storage (models, data)
resource "hcloud_volume" "shared_storage" {
  name     = "${var.environment}-shared-storage"
  size     = var.storage_size
  location = "fsn1"
  format   = "ext4"

  labels = {
    environment = var.environment
    managed_by  = "terraform"
    project     = "ai-infrastructure"
    type        = "shared-storage"
  }
}

# Load balancer for external access
resource "hcloud_load_balancer" "main" {
  name               = "${var.environment}-main-lb"
  load_balancer_type = "lb11"
  location           = "fsn1"

  labels = {
    environment = var.environment
    managed_by  = "terraform"
    project     = "ai-infrastructure"
    type        = "main-loadbalancer"
  }
}

resource "hcloud_load_balancer_network" "main" {
  load_balancer_id = hcloud_load_balancer.main.id
  network_id       = hcloud_network.main.id
  ip               = "10.0.2.100"
}

# Managed (Let's Encrypt) certificate for HTTPS; the plain hcloud_certificate
# resource only supports uploaded certificates, so the managed variant is used
resource "hcloud_managed_certificate" "main" {
  count = var.domain_name != "" ? 1 : 0

  name         = "${var.environment}-ssl-cert"
  domain_names = [var.domain_name]

  labels = {
    environment = var.environment
    managed_by  = "terraform"
    project     = "ai-infrastructure"
  }
}

# Random password for internal services
resource "random_password" "internal_secret" {
  length  = 32
  special = true
}

# Local file for Ansible inventory template
resource "local_file" "inventory_template" {
  content = templatefile("${path.module}/templates/inventory.yml.tpl", {
    environment  = var.environment
    network_cidr = var.private_network_cidr
    gex44_subnet = var.gex44_subnet
    cloud_subnet = var.cloud_subnet
  })

  filename = "${path.module}/../../../ansible/inventory/${var.environment}-template.yml"
}
87
terraform/modules/hcloud-base/outputs.tf
Normal file
@ -0,0 +1,87 @@
# Outputs for hcloud-base module

output "network_id" {
  description = "ID of the private network"
  value       = hcloud_network.main.id
}

output "network_name" {
  description = "Name of the private network"
  value       = hcloud_network.main.name
}

output "network_cidr" {
  description = "CIDR block of the private network"
  value       = hcloud_network.main.ip_range
}

output "gex44_subnet_id" {
  description = "ID of the GEX44 subnet"
  value       = hcloud_network_subnet.gex44.id
}

output "cloud_subnet_id" {
  description = "ID of the cloud subnet"
  value       = hcloud_network_subnet.cloud.id
}

output "ssh_key_id" {
  description = "ID of the SSH key"
  value       = local.ssh_key_id
}

output "ssh_key_name" {
  description = "Name of the SSH key"
  value       = local.ssh_key_name
}

output "placement_group_id" {
  description = "ID of the placement group"
  value       = hcloud_placement_group.main.id
}

output "shared_storage_id" {
  description = "ID of the shared storage volume"
  value       = hcloud_volume.shared_storage.id
}

output "load_balancer_id" {
  description = "ID of the main load balancer"
  value       = hcloud_load_balancer.main.id
}

output "load_balancer_ip" {
  description = "Public IP of the main load balancer"
  # The hcloud provider exposes the public IPv4 address as `ipv4`
  value = hcloud_load_balancer.main.ipv4
}

output "firewall_ids" {
  description = "IDs of created firewalls"
  value = {
    ssh        = hcloud_firewall.ssh.id
    web        = hcloud_firewall.web.id
    monitoring = hcloud_firewall.monitoring.id
    internal   = hcloud_firewall.internal.id
  }
}

output "firewall_rules" {
  description = "Summary of firewall rules"
  value = {
    ssh_allowed_cidrs = var.allowed_ssh_cidrs
    web_ports         = ["80", "443", "8000"]
    monitoring_ports  = ["3000", "9090", "9100", "9835"]
    internal_network  = var.private_network_cidr
  }
}

output "certificate_id" {
  description = "ID of the SSL certificate"
  value       = var.domain_name != "" ? hcloud_managed_certificate.main[0].id : null
}

output "internal_secret" {
  description = "Generated internal secret for services"
  value       = random_password.internal_secret.result
  sensitive   = true
}
48
terraform/modules/hcloud-base/templates/inventory.yml.tpl
Normal file
@ -0,0 +1,48 @@
# Ansible inventory template for ${environment} environment
# Generated by Terraform - do not edit manually

all:
  vars:
    ansible_user: ubuntu
    ansible_ssh_private_key_file: ~/.ssh/hetzner_key
    ansible_ssh_common_args: '-o StrictHostKeyChecking=no'

  children:
    cloud_servers:
      vars:
        network_zone: eu-central
        private_network: ${network_cidr}
        subnet: ${cloud_subnet}

    gex44_servers:
      vars:
        network_zone: eu-central
        private_network: ${network_cidr}
        subnet: ${gex44_subnet}
        gpu_type: rtx_4000_ada
        vram_size: 20

      hosts:
        gex44-1:
          ansible_host: 10.0.1.10
          gpu_index: 0

        gex44-2:
          ansible_host: 10.0.1.11
          gpu_index: 1

        gex44-3:
          ansible_host: 10.0.1.12
          gpu_index: 2

    load_balancers:
      children:
        cloud_servers:

    api_gateways:
      children:
        cloud_servers:

    monitoring:
      children:
        cloud_servers:
terraform/modules/hcloud-base/variables.tf
Normal file
59
terraform/modules/hcloud-base/variables.tf
Normal file
@ -0,0 +1,59 @@
|
# Variables for hcloud-base module

variable "environment" {
  description = "Environment name"
  type        = string
}

variable "ssh_public_key" {
  description = "SSH public key content"
  type        = string
}

variable "ssh_key_name" {
  description = "Name for the SSH key"
  type        = string
  default     = null
}

variable "network_zone" {
  description = "Hetzner Cloud network zone"
  type        = string
  default     = "eu-central"
}

variable "private_network_cidr" {
  description = "CIDR block for private network"
  type        = string
  default     = "10.0.0.0/16"
}

variable "gex44_subnet" {
  description = "Subnet for GEX44 servers"
  type        = string
  default     = "10.0.1.0/24"
}

variable "cloud_subnet" {
  description = "Subnet for cloud servers"
  type        = string
  default     = "10.0.2.0/24"
}

variable "allowed_ssh_cidrs" {
  description = "CIDR blocks allowed for SSH access"
  type        = list(string)
  # Open default for testing only; restrict to known ranges in production
  default = ["0.0.0.0/0"]
}

variable "storage_size" {
  description = "Size of shared storage volume in GB"
  type        = number
  default     = 500
}

variable "domain_name" {
  description = "Domain name for SSL certificate"
  type        = string
  default     = ""
}
218
terraform/modules/load-balancer/cloud-init/haproxy-init.yaml
Normal file
@ -0,0 +1,218 @@
#cloud-config
# HAProxy Load Balancer cloud-init configuration
# Rendered via Terraform templatefile(); literal bash $${...} expansions and
# curl %%{...} format strings are escaped so Terraform leaves them untouched.

package_update: true
package_upgrade: true

packages:
  - haproxy
  # certbot (standalone) covers later Let's Encrypt issuance; HAProxy needs
  # no web-server plugin
  - certbot
  - htop
  - curl
  - jq
  - prometheus-node-exporter

write_files:
  - path: /etc/haproxy/haproxy.cfg
    content: |
      global
          log stdout local0
          chroot /var/lib/haproxy
          stats socket /run/haproxy/admin.sock mode 660 level admin
          stats timeout 30s
          user haproxy
          group haproxy
          daemon

          # Improved SSL settings
          ssl-default-bind-ciphers ECDHE+aRSA+AES256+GCM+SHA384:ECDHE+aRSA+CHACHA20:ECDHE+aRSA+AES128+GCM+SHA256:ECDHE+aRSA+AES256+SHA384:ECDHE+aRSA+AES128+SHA256:ECDHE+aRSA+AES256+SHA256:DHE+aRSA+AES256+GCM+SHA384:DHE+aRSA+CHACHA20:DHE+aRSA+AES128+GCM+SHA256:DHE+aRSA+AES256+SHA256:DHE+aRSA+AES128+SHA256:!aNULL:!eNULL:!EXPORT:!DES:!RC4:!MD5:!PSK:!SRP:!CAMELLIA
          ssl-default-bind-options no-sslv3 no-tlsv10 no-tlsv11
          ssl-default-server-ciphers ECDHE+aRSA+AES256+GCM+SHA384:ECDHE+aRSA+CHACHA20:ECDHE+aRSA+AES128+GCM+SHA256:ECDHE+aRSA+AES256+SHA384:ECDHE+aRSA+AES128+SHA256:ECDHE+aRSA+AES256+SHA256:DHE+aRSA+AES256+GCM+SHA384:DHE+aRSA+CHACHA20:DHE+aRSA+AES128+GCM+SHA256:DHE+aRSA+AES256+SHA256:DHE+aRSA+AES128+SHA256:!aNULL:!eNULL:!EXPORT:!DES:!RC4:!MD5:!PSK:!SRP:!CAMELLIA
          ssl-default-server-options no-sslv3 no-tlsv10 no-tlsv11

      defaults
          mode http
          log global
          option httplog
          option dontlognull
          option log-health-checks
          option forwardfor
          option http-server-close
          timeout connect 5s
          timeout client 50s
          timeout server 50s
          timeout http-request 15s
          timeout http-keep-alive 15s
          errorfile 400 /etc/haproxy/errors/400.http
          errorfile 403 /etc/haproxy/errors/403.http
          errorfile 408 /etc/haproxy/errors/408.http
          errorfile 500 /etc/haproxy/errors/500.http
          errorfile 502 /etc/haproxy/errors/502.http
          errorfile 503 /etc/haproxy/errors/503.http
          errorfile 504 /etc/haproxy/errors/504.http

      frontend api_frontend
          bind *:80
          bind *:443 ssl crt /etc/ssl/certs/haproxy.pem

          # Health check endpoint (kept reachable over plain HTTP)
          acl health_check path_beg /health
          use_backend health_backend if health_check

          # Redirect remaining HTTP to HTTPS; redirect rules run before
          # backend selection, so /health must be excluded explicitly
          redirect scheme https if !{ ssl_fc } !health_check

          # API endpoints
          acl api_path path_beg /v1/
          use_backend vllm_backend if api_path

          # Default to API
          default_backend vllm_backend

      backend vllm_backend
          balance roundrobin
          option httpchk GET /health
          http-check expect status 200

          # Add retry logic
          retries 3
          timeout server 60s
          timeout connect 10s

      %{~ for idx, ip in gex44_ips ~}
          server gex44-${idx + 1} ${ip}:8000 check inter 10s fall 3 rise 2 weight 100
      %{~ endfor ~}

      backend health_backend
          # HAProxy treats this string literally (no shell substitution),
          # so the response body is static JSON
          http-request return status 200 content-type "application/json" string '{"status":"healthy","service":"load-balancer","environment":"${environment}"}'

      listen stats
          bind *:8404
          stats enable
          stats uri /stats
          stats refresh 10s
          stats admin if TRUE
          # Default credentials; change before exposing the stats port
          stats auth admin:admin123
    permissions: '0644'

  - path: /etc/logrotate.d/haproxy
    content: |
      /var/log/haproxy.log {
          daily
          missingok
          rotate 52
          compress
          delaycompress
          notifempty
          create 644 syslog adm
          postrotate
              /bin/kill -HUP `cat /var/run/rsyslogd.pid 2> /dev/null` 2> /dev/null || true
          endrotate
      }
    permissions: '0644'

  - path: /etc/rsyslog.d/49-haproxy.conf
    content: |
      # Send HAProxy messages to a dedicated logfile
      :programname, startswith, "haproxy" /var/log/haproxy.log
      & stop
    permissions: '0644'

  - path: /opt/health-check.sh
    permissions: '0755'
    content: |
      #!/bin/bash
      # Health check script for HAProxy backends

      check_backend() {
          local backend_ip=$1
          local backend_port=$${2:-8000}
          local health_path=$${3:-/health}

          response=$(curl -s -o /dev/null -w "%%{http_code}" --max-time 5 "http://$backend_ip:$backend_port$health_path")

          if [ "$response" == "200" ]; then
              echo "✓ Backend $backend_ip:$backend_port is healthy"
              return 0
          else
              echo "✗ Backend $backend_ip:$backend_port is unhealthy (HTTP $response)"
              return 1
          fi
      }

      echo "=== HAProxy Backend Health Check ==="
      echo "Timestamp: $(date)"
      echo "Environment: ${environment}"
      echo ""

      all_healthy=true
      %{~ for ip in gex44_ips ~}
      if ! check_backend "${ip}"; then
          all_healthy=false
      fi
      %{~ endfor ~}

      echo ""
      if [ "$all_healthy" = true ]; then
          echo "🎉 All backends are healthy!"
          exit 0
      else
          echo "⚠️ Some backends are unhealthy!"
          exit 1
      fi

  - path: /opt/haproxy-reload.sh
    permissions: '0755'
    content: |
      #!/bin/bash
      # Script to safely reload HAProxy configuration

      echo "Testing HAProxy configuration..."
      if haproxy -f /etc/haproxy/haproxy.cfg -c; then
          echo "Configuration is valid. Reloading HAProxy..."
          systemctl reload haproxy
          echo "HAProxy reloaded successfully."
      else
          echo "Configuration test failed. Not reloading HAProxy."
          exit 1
      fi

runcmd:
  # Enable and start services
  - systemctl enable haproxy
  - systemctl enable prometheus-node-exporter
  - systemctl restart rsyslog
  - systemctl start prometheus-node-exporter

  # Generate self-signed certificate for HTTPS (replace with Let's Encrypt later)
  - openssl req -x509 -nodes -days 365 -newkey rsa:2048 -keyout /etc/ssl/private/haproxy.key -out /etc/ssl/certs/haproxy.crt -subj "/C=DE/ST=Hessen/L=Frankfurt/O=AI Infrastructure/CN=api.${environment}.local"
  - cat /etc/ssl/certs/haproxy.crt /etc/ssl/private/haproxy.key > /etc/ssl/certs/haproxy.pem

  # Start HAProxy
  - systemctl start haproxy

  # Setup health check cron job
  - echo "*/2 * * * * root /opt/health-check.sh >> /var/log/backend-health.log 2>&1" >> /etc/crontab

  # Setup log rotation
  - logrotate -f /etc/logrotate.d/haproxy

final_message: |
  HAProxy Load Balancer for ${environment} environment is ready!

  Services running:
  - HAProxy on ports 80, 443
  - Statistics on port 8404 (/stats)
  - Node Exporter on port 9100

  Backend servers:
  %{~ for idx, ip in gex44_ips ~}
  - GEX44-${idx + 1}: ${ip}:8000
  %{~ endfor ~}

  Health check: curl http://localhost/health
  Stats: http://localhost:8404/stats (admin/admin123)

  Logs: /var/log/haproxy.log
  Backend health: /var/log/backend-health.log
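A quick post-boot verification sketch (run on the load balancer itself; all
three commands use only what cloud-init installs above):

  curl -s http://localhost/health | jq .
  /opt/health-check.sh
  /opt/haproxy-reload.sh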
163
terraform/modules/load-balancer/main.tf
Normal file
@ -0,0 +1,163 @@
# Load Balancer module for AI Infrastructure

# Cloud-init script for HAProxy configuration
locals {
  cloud_init = base64encode(templatefile("${path.module}/cloud-init/haproxy-init.yaml", {
    gex44_ips   = var.gex44_ips
    environment = var.environment
  }))
}

# Load balancer server
resource "hcloud_server" "load_balancer" {
  name        = "${var.environment}-load-balancer"
  server_type = var.server_type
  image       = "ubuntu-22.04"
  location    = "fsn1"

  ssh_keys = [var.ssh_key_name]

  user_data = local.cloud_init

  network {
    network_id = var.network_id
    ip         = var.private_ip
  }

  firewall_ids = var.firewall_ids

  public_net {
    ipv4_enabled = true
    ipv6_enabled = false
  }

  labels = {
    environment = var.environment
    managed_by  = "terraform"
    project     = "ai-infrastructure"
    role        = "load-balancer"
    type        = "haproxy"
  }
}

# Volume attachment for logs and config
resource "hcloud_volume_attachment" "lb_storage" {
  count     = var.enable_persistent_storage ? 1 : 0
  volume_id = var.storage_volume_id
  server_id = hcloud_server.load_balancer.id
  automount = true
}

# Floating IP for high availability (optional)
resource "hcloud_floating_ip" "lb_floating_ip" {
  count         = var.enable_floating_ip ? 1 : 0
  type          = "ipv4"
  home_location = "fsn1"
  name          = "${var.environment}-lb-floating-ip"

  labels = {
    environment = var.environment
    managed_by  = "terraform"
    project     = "ai-infrastructure"
    role        = "load-balancer-floating"
  }
}

resource "hcloud_floating_ip_assignment" "lb_floating_ip" {
  count          = var.enable_floating_ip ? 1 : 0
  floating_ip_id = hcloud_floating_ip.lb_floating_ip[0].id
  server_id      = hcloud_server.load_balancer.id
}

# Load balancer configuration (using Hetzner Cloud Load Balancer as alternative)
resource "hcloud_load_balancer" "api_lb" {
  count              = var.enable_cloud_lb ? 1 : 0
  name               = "${var.environment}-api-cloud-lb"
  load_balancer_type = "lb11"
  location           = "fsn1"

  labels = {
    environment = var.environment
    managed_by  = "terraform"
    project     = "ai-infrastructure"
    role        = "cloud-load-balancer"
  }
}

resource "hcloud_load_balancer_network" "api_lb" {
  count            = var.enable_cloud_lb ? 1 : 0
  load_balancer_id = hcloud_load_balancer.api_lb[0].id
  network_id       = var.network_id
  ip               = "10.0.2.101"
}

# Health-checked targets for GEX44 servers; IP targets take the address
# directly (no nested targets block, and use_private_ip only applies to
# server-type targets)
resource "hcloud_load_balancer_target" "gex44_targets" {
  count            = var.enable_cloud_lb ? length(var.gex44_ips) : 0
  type             = "ip"
  load_balancer_id = hcloud_load_balancer.api_lb[0].id
  ip               = var.gex44_ips[count.index]
}

# HTTP service configuration
resource "hcloud_load_balancer_service" "api_http" {
  count            = var.enable_cloud_lb ? 1 : 0
  load_balancer_id = hcloud_load_balancer.api_lb[0].id
  protocol         = "http"
  listen_port      = 80
  destination_port = 8000

  health_check {
    protocol = "http"
    port     = 8000
    interval = 15
    timeout  = 10
    retries  = 3
    http {
      path         = "/health"
      status_codes = ["200"]
    }
  }

  http {
    sticky_sessions = false
    redirect_http   = false
    cookie_name     = "HCLBSTICKY"
    cookie_lifetime = 300
  }
}

# HTTPS service configuration
resource "hcloud_load_balancer_service" "api_https" {
  count            = var.enable_cloud_lb && var.ssl_certificate_id != null ? 1 : 0
  load_balancer_id = hcloud_load_balancer.api_lb[0].id
  protocol         = "https"
  listen_port      = 443
  destination_port = 8000

  health_check {
    protocol = "http"
    port     = 8000
    interval = 15
    timeout  = 10
    retries  = 3
    http {
      path         = "/health"
      status_codes = ["200"]
    }
  }

  http {
    sticky_sessions = false
    redirect_http   = true
    cookie_name     = "HCLBSTICKY"
    cookie_lifetime = 300
    certificates    = [var.ssl_certificate_id]
  }
}
133
terraform/modules/load-balancer/variables.tf
Normal file
@ -0,0 +1,133 @@
|
||||
# Variables for load-balancer module
|
||||
|
||||
variable "environment" {
|
||||
description = "Environment name"
|
||||
type = string
|
||||
}
|
||||
|
||||
variable "network_id" {
|
||||
description = "ID of the private network"
|
||||
type = string
|
||||
}
|
||||
|
||||
variable "subnet_id" {
|
||||
description = "ID of the subnet"
|
||||
type = string
|
||||
}
|
||||
|
||||
variable "ssh_key_name" {
|
||||
description = "Name of the SSH key"
|
||||
type = string
|
||||
}
|
||||
|
||||
variable "server_type" {
|
||||
description = "Hetzner Cloud server type for load balancer"
|
||||
type = string
|
||||
default = "cx31" # 8 vCPU, 32GB RAM
|
||||
}
|
||||
|
||||
variable "private_ip" {
|
||||
description = "Private IP address for the load balancer"
|
||||
type = string
|
||||
default = "10.0.2.10"
|
||||
}
|
||||
|
||||
variable "gex44_ips" {
|
||||
description = "List of GEX44 server IP addresses"
|
||||
type = list(string)
|
||||
}
|
||||
|
||||
variable "firewall_ids" {
|
||||
description = "List of firewall IDs to apply"
|
||||
type = list(string)
|
||||
default = []
|
||||
}
|
||||
|
||||
variable "enable_floating_ip" {
|
||||
description = "Enable floating IP for high availability"
|
||||
type = bool
|
||||
default = false
|
||||
}
|
||||
|
||||
variable "enable_cloud_lb" {
|
||||
description = "Enable Hetzner Cloud Load Balancer instead of HAProxy"
|
||||
type = bool
|
||||
default = false
|
||||
}
|
||||
|
||||
variable "enable_persistent_storage" {
|
||||
description = "Enable persistent storage volume"
|
||||
type = bool
|
||||
default = false
|
||||
}
|
||||
|
||||
variable "storage_volume_id" {
|
||||
description = "ID of storage volume to attach"
|
||||
type = string
|
||||
default = null
|
||||
}
|
||||
|
||||
variable "ssl_certificate_id" {
|
||||
description = "ID of SSL certificate for HTTPS"
|
||||
type = string
|
||||
default = null
|
||||
}
|
||||
|
||||
variable "health_check_path" {
|
||||
description = "Health check path for backend servers"
|
||||
type = string
|
||||
default = "/health"
|
||||
}
|
||||
|
||||
variable "load_balancing_algorithm" {
|
||||
description = "Load balancing algorithm (round_robin, least_connections, ip_hash)"
|
||||
type = string
|
||||
default = "round_robin"
|
||||
|
||||
validation {
|
||||
condition = contains(["round_robin", "least_connections", "ip_hash"], var.load_balancing_algorithm)
|
||||
error_message = "Load balancing algorithm must be round_robin, least_connections, or ip_hash."
|
||||
}
|
||||
}
|
||||
|
||||
variable "enable_session_persistence" {
|
||||
description = "Enable session persistence (sticky sessions)"
|
||||
type = bool
|
||||
default = false
|
||||
}
|
||||
|
||||
variable "max_connections" {
|
||||
description = "Maximum number of connections per backend server"
|
||||
type = number
|
||||
default = 1000
|
||||
}
|
||||
|
||||
variable "connection_timeout" {
|
||||
description = "Connection timeout in seconds"
|
||||
type = number
|
||||
default = 5
|
||||
}
|
||||
|
||||
variable "enable_http_redirect" {
|
||||
description = "Redirect HTTP to HTTPS"
|
||||
type = bool
|
||||
default = true
|
||||
}
|
||||
|
||||
variable "enable_monitoring" {
|
||||
description = "Enable HAProxy monitoring endpoint"
|
||||
type = bool
|
||||
default = true
|
||||
}
|
||||
|
||||
variable "monitoring_port" {
|
||||
description = "Port for HAProxy monitoring interface"
|
||||
type = number
|
||||
default = 8404
|
||||
}
|
||||
|
||||
variable "monitoring_uri" {
|
||||
description = "URI for HAProxy monitoring interface"
|
||||
type = string
|
||||
default = "/stats"
|
||||
}
|
||||
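
# A sketch of wiring this module from the root configuration; the module and
# resource names are the ones referenced by terraform/outputs.tf below, and
# the relative source path is assumed:
#   module "load_balancer" {
#     source      = "./modules/load-balancer"
#     environment = var.environment
#     network_id  = module.hcloud_base.network_id
#     gex44_ips   = ["10.0.1.10", "10.0.1.11", "10.0.1.12"]
#   }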
170
terraform/outputs.tf
Normal file
@ -0,0 +1,170 @@

# Outputs for AI Infrastructure

# Network information
output "private_network_id" {
  description = "ID of the private network"
  value       = module.hcloud_base.network_id
}

output "private_network_cidr" {
  description = "CIDR block of the private network"
  value       = var.private_network_cidr
}

# Load balancer information
output "load_balancer_ip" {
  description = "Public IP address of the load balancer"
  value       = module.load_balancer.public_ip
}

output "load_balancer_private_ip" {
  description = "Private IP address of the load balancer"
  value       = module.load_balancer.private_ip
}

# API Gateway information
output "api_gateway_ip" {
  description = "Public IP address of the API gateway"
  value       = module.api_gateway.public_ip
}

output "api_gateway_private_ip" {
  description = "Private IP address of the API gateway"
  value       = module.api_gateway.private_ip
}

# Monitoring information
output "monitoring_ip" {
  description = "Public IP address of the monitoring server"
  value       = module.monitoring.public_ip
}

output "monitoring_private_ip" {
  description = "Private IP address of the monitoring server"
  value       = module.monitoring.private_ip
}

output "grafana_url" {
  description = "URL to access Grafana dashboard"
  value       = "https://${module.monitoring.public_ip}:3000"
}

output "prometheus_url" {
  description = "URL to access Prometheus"
  value       = "http://${module.monitoring.public_ip}:9090"
}

# GEX44 configuration
output "gex44_config_ips" {
  description = "IP addresses of GEX44 configuration helpers"
  value       = module.gex44_config.server_ips
}

output "gex44_target_ips" {
  description = "Target IP addresses for GEX44 servers"
  value = [
    "10.0.1.10",
    "10.0.1.11",
    "10.0.1.12"
  ]
}

# API endpoints
output "api_endpoints" {
  description = "API endpoints for different services"
  value = {
    inference = "http://${module.load_balancer.public_ip}/v1/chat/completions"
    models    = "http://${module.load_balancer.public_ip}/v1/models"
    health    = "http://${module.load_balancer.public_ip}/health"
    metrics   = "http://${module.load_balancer.public_ip}/metrics"
  }
}

# Connection information
output "ssh_commands" {
  description = "SSH commands to connect to servers"
  value = {
    load_balancer = "ssh -i ~/.ssh/hetzner_key ubuntu@${module.load_balancer.public_ip}"
    api_gateway   = "ssh -i ~/.ssh/hetzner_key ubuntu@${module.api_gateway.public_ip}"
    monitoring    = "ssh -i ~/.ssh/hetzner_key ubuntu@${module.monitoring.public_ip}"
  }
}

# Cost tracking information
output "estimated_monthly_cost" {
  description = "Estimated monthly cost in EUR"
  value = {
    load_balancer    = 22.68 # cx31
    api_gateway      = 22.68 # cx31
    monitoring       = 11.76 # cx21
    storage          = var.additional_storage_size * 0.05 # 0.05 EUR/GB/month
    total_cloud      = 22.68 + 22.68 + 11.76 + (var.additional_storage_size * 0.05)
    gex44_per_server = 184.00
    gex44_total      = var.gex44_count * 184.00
    total_monthly    = 22.68 + 22.68 + 11.76 + (var.additional_storage_size * 0.05) + (var.gex44_count * 184.00)
  }
}
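# With the defaults used elsewhere in this configuration (500 GB extra storage,
# 3 GEX44 servers), total_monthly evaluates to 57.12 + 25.00 + 552.00 = 634.12 EUR.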

# Environment information
output "environment_info" {
  description = "Environment configuration summary"
  value = {
    environment      = var.environment
    gex44_count      = var.gex44_count
    network_zone     = var.network_zone
    auto_scaling     = var.enable_auto_scaling
    backup_enabled   = var.enable_backups
    firewall_enabled = var.enable_firewall
  }
}

# Security information
output "firewall_rules" {
  description = "Applied firewall rules"
  value       = module.hcloud_base.firewall_rules
}

# Backup information
output "backup_info" {
  description = "Backup configuration"
  value = {
    enabled        = var.enable_backups
    retention_days = var.backup_retention_days
    schedule       = "Daily at 3:00 AM UTC"
  }
}

# Auto-scaling configuration
output "autoscaling_config" {
  description = "Auto-scaling configuration"
  value = {
    enabled              = var.enable_auto_scaling
    scale_up_threshold   = var.scale_up_threshold
    scale_down_threshold = var.scale_down_threshold
    min_servers          = var.min_gex44_count
    max_servers          = var.max_gex44_count
  }
}

# Quick start information
output "quick_start_guide" {
  description = "Quick start commands"
  value = {
    health_check   = "curl -f http://${module.load_balancer.public_ip}/health"
    list_models    = "curl http://${module.load_balancer.public_ip}/v1/models"
    test_inference = "curl -X POST http://${module.load_balancer.public_ip}/v1/chat/completions -H 'Content-Type: application/json' -d '{\"model\":\"mixtral-8x7b\",\"messages\":[{\"role\":\"user\",\"content\":\"Hello\"}]}'"
    monitoring     = "open https://${module.monitoring.public_ip}:3000"
    ssh_lb         = "ssh -i ~/.ssh/hetzner_key ubuntu@${module.load_balancer.public_ip}"
  }
}

# Terraform state information
output "terraform_info" {
  description = "Terraform configuration information"
  value = {
    terraform_version = "~> 1.5"
    hcloud_provider   = "~> 1.45"
    state_backend     = "Remote (configure in backend.tf)"
    last_applied      = timestamp()
  }
}
218
terraform/variables.tf
Normal file
@ -0,0 +1,218 @@

# Variables for AI Infrastructure Terraform configuration

# Core configuration
variable "environment" {
  description = "Environment name (dev, staging, production)"
  type        = string
  validation {
    condition     = contains(["dev", "staging", "production"], var.environment)
    error_message = "Environment must be dev, staging, or production."
  }
}

variable "hcloud_token" {
  description = "Hetzner Cloud API token"
  type        = string
  sensitive   = true
}

# SSH configuration
variable "ssh_public_key" {
  description = "SSH public key content for server access"
  type        = string
}

variable "ssh_key_name" {
  description = "Name of the SSH key in Hetzner Cloud"
  type        = string
  default     = "ai-infrastructure"
}

# Network configuration
variable "network_zone" {
  description = "Hetzner Cloud network zone"
  type        = string
  default     = "eu-central"
}

variable "private_network_cidr" {
  description = "CIDR block for private network"
  type        = string
  default     = "10.0.0.0/16"
}

variable "gex44_subnet" {
  description = "Subnet for GEX44 servers"
  type        = string
  default     = "10.0.1.0/24"
}

variable "cloud_subnet" {
  description = "Subnet for cloud servers"
  type        = string
  default     = "10.0.2.0/24"
}

variable "allowed_ssh_cidrs" {
  description = "CIDR blocks allowed for SSH access"
  type        = list(string)
  default     = ["0.0.0.0/0"] # Restrict this in production
}

# GEX44 configuration
variable "gex44_count" {
  description = "Number of GEX44 servers to configure"
  type        = number
  default     = 3
  validation {
    condition     = var.gex44_count >= 1 && var.gex44_count <= 10
    error_message = "GEX44 count must be between 1 and 10."
  }
}

# Auto-scaling configuration
variable "scale_up_threshold" {
  description = "GPU utilization threshold for scaling up (0-1)"
  type        = number
  default     = 0.8
  validation {
    condition     = var.scale_up_threshold >= 0.5 && var.scale_up_threshold <= 1.0
    error_message = "Scale up threshold must be between 0.5 and 1.0."
  }
}

variable "scale_down_threshold" {
  description = "GPU utilization threshold for scaling down (0-1)"
  type        = number
  default     = 0.3
  validation {
    condition     = var.scale_down_threshold >= 0.1 && var.scale_down_threshold <= 0.5
    error_message = "Scale down threshold must be between 0.1 and 0.5."
  }
}

variable "min_gex44_count" {
  description = "Minimum number of GEX44 servers"
  type        = number
  default     = 1
}

variable "max_gex44_count" {
  description = "Maximum number of GEX44 servers"
  type        = number
  default     = 10
}

# Monitoring configuration
variable "monitoring_retention_days" {
  description = "Prometheus data retention in days"
  type        = number
  default     = 30
}

variable "grafana_admin_password" {
  description = "Grafana admin password"
  type        = string
  sensitive   = true
}

# CI/CD configuration
variable "ansible_repo_url" {
  description = "Git repository URL for Ansible configuration"
  type        = string
}

variable "gitlab_deploy_token" {
  description = "GitLab deploy token for repository access"
  type        = string
  sensitive   = true
}

variable "vault_password" {
  description = "Ansible Vault password"
  type        = string
  sensitive   = true
}

# Optional configurations
variable "enable_backups" {
  description = "Enable automatic backups"
  type        = bool
  default     = true
}

variable "backup_retention_days" {
  description = "Backup retention period in days"
  type        = number
  default     = 7
}

variable "enable_auto_scaling" {
  description = "Enable automatic GPU server scaling"
  type        = bool
  default     = true
}

variable "api_domain" {
  description = "Domain for API endpoint"
  type        = string
  default     = ""
}

variable "monitoring_domain" {
  description = "Domain for monitoring dashboard"
  type        = string
  default     = ""
}

# Cost tracking
variable "project_name" {
  description = "Project name for cost tracking"
  type        = string
  default     = "ai-infrastructure"
}

variable "cost_center" {
  description = "Cost center for billing"
  type        = string
  default     = "engineering"
}

# Security configuration
variable "enable_firewall" {
  description = "Enable cloud firewall"
  type        = bool
  default     = true
}

variable "allowed_api_cidrs" {
  description = "CIDR blocks allowed for API access"
  type        = list(string)
  default     = ["0.0.0.0/0"] # Restrict this in production
}

# Performance tuning
variable "load_balancer_type" {
  description = "Load balancer server type"
  type        = string
  default     = "cx31" # 2 vCPU, 8 GB RAM
}

variable "api_gateway_type" {
  description = "API Gateway server type"
  type        = string
  default     = "cx31" # 2 vCPU, 8 GB RAM
}

variable "monitoring_type" {
  description = "Monitoring server type"
  type        = string
  default     = "cx21" # 2 vCPU, 4 GB RAM
}

# Storage configuration
variable "additional_storage_size" {
  description = "Additional storage size in GB for models/data"
  type        = number
  default     = 500
}
40
terraform/versions.tf
Normal file
@ -0,0 +1,40 @@

# Terraform version constraints and provider requirements

terraform {
  required_version = ">= 1.5"

  required_providers {
    hcloud = {
      source  = "hetznercloud/hcloud"
      version = "~> 1.45"
    }

    random = {
      source  = "hashicorp/random"
      version = "~> 3.1"
    }

    tls = {
      source  = "hashicorp/tls"
      version = "~> 4.0"
    }

    local = {
      source  = "hashicorp/local"
      version = "~> 2.1"
    }

    template = {
      source  = "hashicorp/template"
      version = "~> 2.2"
    }
  }

  # Backend configuration - uncomment and configure for remote state
  # backend "s3" {
  #   bucket  = "your-terraform-state-bucket"
  #   key     = "ai-infrastructure/terraform.tfstate"
  #   region  = "eu-central-1"
  #   encrypt = true
  # }
}
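
# The hashicorp/template provider is archived; on current Terraform versions
# the built-in templatefile() function covers the same use case. Once the
# backend block is enabled, values can also be supplied at init time, e.g.:
#   terraform init -backend-config="bucket=your-terraform-state-bucket"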
468
tests/contracts/test_inference_api.py
Normal file
@ -0,0 +1,468 @@

#!/usr/bin/env python3
"""
Contract tests for AI Inference API using Pact framework.
These tests ensure API compatibility between consumer and provider.
"""

import os
import time

import pytest
import requests
from pact import Consumer, Provider, Like, EachLike, Format

# Pact configuration
pact = Consumer('ai-frontend').has_pact_with(Provider('inference-api'))
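
# Entering `with pact:` starts the Pact mock service and registers the
# interaction declared just above it; leaving the block verifies that the
# expected request was actually received.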

class TestInferenceAPIContracts:
    """Test suite for inference API contracts"""

    @pytest.fixture(scope="session")
    def api_url(self):
        """Get API URL from environment or use default"""
        return os.getenv('API_URL', 'http://localhost:8000')

    def test_health_endpoint_contract(self):
        """Test /health endpoint contract"""
        expected_response = {
            "status": Like("healthy"),
            "service": Like("inference-api"),
            "timestamp": Format().iso_8601_datetime(),
            "version": Like("1.0.0"),
            "gpu_count": Like(3),
            "models_loaded": Like(["mixtral-8x7b"])
        }

        (pact
         .given('inference service is healthy')
         .upon_receiving('a health check request')
         .with_request('GET', '/health')
         .will_respond_with(200, body=expected_response))

        with pact:
            response = requests.get(pact.uri + '/health')
            assert response.status_code == 200
            data = response.json()
            assert data['status'] == 'healthy'
            assert 'timestamp' in data
            assert isinstance(data['gpu_count'], int)

    def test_models_endpoint_contract(self):
        """Test /v1/models endpoint contract"""
        expected_response = {
            "object": "list",
            "data": EachLike({
                "id": Like("mixtral-8x7b"),
                "object": "model",
                "created": Like(1699046400),
                "owned_by": Like("mistralai"),
                "permissions": Like([]),
                "root": Like("mixtral-8x7b"),
                "parent": Like(None)
            })
        }

        (pact
         .given('models are loaded')
         .upon_receiving('a models list request')
         .with_request('GET', '/v1/models')
         .will_respond_with(200, body=expected_response))

        with pact:
            response = requests.get(pact.uri + '/v1/models')
            assert response.status_code == 200
            data = response.json()
            assert data['object'] == 'list'
            assert len(data['data']) > 0
            assert all('id' in model for model in data['data'])

    def test_chat_completion_contract(self):
        """Test /v1/chat/completions endpoint contract"""
        expected_response = {
            "id": Like("chatcmpl-123"),
            "object": "chat.completion",
            "created": Like(1699046400),
            "model": Like("mixtral-8x7b"),
            "choices": EachLike({
                "index": Like(0),
                "message": {
                    "role": "assistant",
                    "content": Like("Hello! How can I help you today?")
                },
                "finish_reason": Like("stop")
            }),
            "usage": {
                "prompt_tokens": Like(10),
                "completion_tokens": Like(20),
                "total_tokens": Like(30)
            },
            "system_fingerprint": Like("fp_44709d6fcb")
        }

        request_body = {
            "model": "mixtral-8x7b",
            "messages": [
                {"role": "user", "content": "Hello"}
            ],
            "max_tokens": 100,
            "temperature": 0.7,
            "stream": False
        }

        (pact
         .given('inference server is ready')
         .upon_receiving('a chat completion request')
         .with_request('POST', '/v1/chat/completions',
                       headers={'Content-Type': 'application/json'},
                       body=request_body)
         .will_respond_with(200, body=expected_response))

        with pact:
            response = requests.post(
                pact.uri + '/v1/chat/completions',
                json=request_body,
                headers={'Content-Type': 'application/json'}
            )

            assert response.status_code == 200
            data = response.json()
            assert 'choices' in data
            assert len(data['choices']) > 0
            assert data['choices'][0]['message']['role'] == 'assistant'
            assert 'usage' in data

    def test_streaming_completion_contract(self):
        """Test streaming completion contract"""
        expected_response = [
            {
                "id": Like("chatcmpl-123"),
                "object": "chat.completion.chunk",
                "created": Like(1699046400),
                "model": Like("mixtral-8x7b"),
                "choices": EachLike({
                    "index": Like(0),
                    "delta": {"content": Like("Hello")},
                    "finish_reason": Like(None)
                })
            },
            {
                "id": Like("chatcmpl-123"),
                "object": "chat.completion.chunk",
                "created": Like(1699046400),
                "model": Like("mixtral-8x7b"),
                "choices": EachLike({
                    "index": Like(0),
                    "delta": {},
                    "finish_reason": Like("stop")
                })
            }
        ]

        request_body = {
            "model": "mixtral-8x7b",
            "messages": [{"role": "user", "content": "Hello"}],
            "stream": True
        }

        (pact
         .given('inference server supports streaming')
         .upon_receiving('a streaming chat completion request')
         .with_request('POST', '/v1/chat/completions',
                       headers={'Content-Type': 'application/json'},
                       body=request_body)
         .will_respond_with(200,
                            headers={'Content-Type': 'text/event-stream'},
                            body=expected_response))

        with pact:
            response = requests.post(
                pact.uri + '/v1/chat/completions',
                json=request_body,
                headers={'Content-Type': 'application/json'},
                stream=True
            )

            assert response.status_code == 200
            assert 'text/event-stream' in response.headers.get('Content-Type', '')

    def test_error_handling_contract(self):
        """Test error response contract"""
        error_response = {
            "error": {
                "message": Like("Invalid request: model not found"),
                "type": Like("invalid_request_error"),
                "param": Like("model"),
                "code": Like("model_not_found")
            }
        }

        request_body = {
            "model": "non-existent-model",
            "messages": [{"role": "user", "content": "Hello"}]
        }

        (pact
         .given('model does not exist')
         .upon_receiving('a request with invalid model')
         .with_request('POST', '/v1/chat/completions',
                       headers={'Content-Type': 'application/json'},
                       body=request_body)
         .will_respond_with(400, body=error_response))

        with pact:
            response = requests.post(
                pact.uri + '/v1/chat/completions',
                json=request_body,
                headers={'Content-Type': 'application/json'}
            )

            assert response.status_code == 400
            data = response.json()
            assert 'error' in data
            assert 'message' in data['error']

    def test_rate_limiting_contract(self):
        """Test rate limiting behavior"""
        rate_limit_response = {
            "error": {
                "message": Like("Rate limit exceeded"),
                "type": Like("rate_limit_error"),
                "code": Like("rate_limit_exceeded")
            }
        }

        (pact
         .given('rate limit is exceeded')
         .upon_receiving('a request that exceeds rate limit')
         .with_request('POST', '/v1/chat/completions',
                       headers={'Content-Type': 'application/json'})
         .will_respond_with(429,
                            headers={'Retry-After': Like('60')},
                            body=rate_limit_response))

        with pact:
            response = requests.post(
                pact.uri + '/v1/chat/completions',
                json={"model": "mixtral-8x7b", "messages": []},
                headers={'Content-Type': 'application/json'}
            )

            assert response.status_code == 429
            assert 'Retry-After' in response.headers

    def test_metrics_endpoint_contract(self):
        """Test /metrics endpoint contract"""
        # Prometheus metrics format validation
        (pact
         .given('metrics are being collected')
         .upon_receiving('a metrics request')
         .with_request('GET', '/metrics')
         .will_respond_with(200,
                            headers={'Content-Type': 'text/plain; version=0.0.4; charset=utf-8'},
                            body=Like('# HELP vllm_requests_total Total number of requests\n')))

        with pact:
            response = requests.get(pact.uri + '/metrics')
            assert response.status_code == 200
            assert 'text/plain' in response.headers.get('Content-Type', '')
            assert 'vllm_requests_total' in response.text


class TestAPIIntegration:
    """Integration tests for actual API endpoints"""

    @pytest.fixture(scope="session")
    def api_url(self):
        return os.getenv('API_URL', 'http://localhost:8000')

    @pytest.fixture(scope="session")
    def wait_for_api(self, api_url):
        """Wait for API to be ready"""
        max_retries = 30
        retry_interval = 10

        for i in range(max_retries):
            try:
                response = requests.get(f"{api_url}/health", timeout=5)
                if response.status_code == 200:
                    return True
            except requests.exceptions.RequestException:
                pass

            if i < max_retries - 1:
                time.sleep(retry_interval)

        pytest.fail(f"API at {api_url} did not become ready within {max_retries * retry_interval} seconds")
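
    # Every test below takes wait_for_api as a fixture, so the suite blocks
    # until the stack answers /health (up to 30 x 10 s = 5 minutes).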
    def test_health_endpoint(self, api_url, wait_for_api):
        """Test actual health endpoint"""
        response = requests.get(f"{api_url}/health")
        assert response.status_code == 200

        data = response.json()
        assert data['status'] == 'healthy'
        assert 'timestamp' in data
        assert 'gpu_count' in data

    def test_models_endpoint(self, api_url, wait_for_api):
        """Test actual models endpoint"""
        response = requests.get(f"{api_url}/v1/models")
        assert response.status_code == 200

        data = response.json()
        assert data['object'] == 'list'
        assert len(data['data']) > 0

        # Verify model structure
        model = data['data'][0]
        assert 'id' in model
        assert 'object' in model
        assert model['object'] == 'model'

    def test_simple_completion(self, api_url, wait_for_api):
        """Test simple completion request"""
        request_data = {
            "model": "mixtral-8x7b",
            "messages": [
                {"role": "user", "content": "Say 'Hello, World!' and nothing else."}
            ],
            "max_tokens": 10,
            "temperature": 0.1
        }

        response = requests.post(
            f"{api_url}/v1/chat/completions",
            json=request_data,
            headers={'Content-Type': 'application/json'},
            timeout=30
        )

        assert response.status_code == 200
        data = response.json()

        # Validate response structure
        assert 'choices' in data
        assert len(data['choices']) > 0
        assert 'message' in data['choices'][0]
        assert 'content' in data['choices'][0]['message']
        assert 'usage' in data

        # Validate usage metrics
        usage = data['usage']
        assert 'prompt_tokens' in usage
        assert 'completion_tokens' in usage
        assert 'total_tokens' in usage
        assert usage['total_tokens'] == usage['prompt_tokens'] + usage['completion_tokens']

    def test_completion_performance(self, api_url, wait_for_api):
        """Test completion performance requirements"""
        request_data = {
            "model": "mixtral-8x7b",
            "messages": [
                {"role": "user", "content": "Write a short poem about artificial intelligence."}
            ],
            "max_tokens": 100,
            "temperature": 0.7
        }

        start_time = time.time()
        response = requests.post(
            f"{api_url}/v1/chat/completions",
            json=request_data,
            headers={'Content-Type': 'application/json'},
            timeout=60
        )
        end_time = time.time()

        assert response.status_code == 200

        # Performance requirements
        response_time = end_time - start_time
        assert response_time < 30, f"Response time {response_time:.2f}s exceeded 30s limit"

        data = response.json()
        completion_tokens = data['usage']['completion_tokens']
        tokens_per_second = completion_tokens / response_time

        # Should generate at least 10 tokens per second
        assert tokens_per_second >= 10, f"Token generation rate {tokens_per_second:.2f} too slow"

    def test_concurrent_requests(self, api_url, wait_for_api):
        """Test handling of concurrent requests"""
        import concurrent.futures
        import threading

        def make_request():
            request_data = {
                "model": "mixtral-8x7b",
                "messages": [
                    {"role": "user", "content": f"Count from 1 to 5. Thread: {threading.current_thread().ident}"}
                ],
                "max_tokens": 20,
                "temperature": 0.1
            }

            response = requests.post(
                f"{api_url}/v1/chat/completions",
                json=request_data,
                headers={'Content-Type': 'application/json'},
                timeout=30
            )
            return response.status_code, response.json()

        # Make 5 concurrent requests
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            futures = [executor.submit(make_request) for _ in range(5)]
            results = [future.result() for future in concurrent.futures.as_completed(futures)]

        # All requests should succeed
        for status_code, data in results:
            assert status_code == 200
            assert 'choices' in data
            assert len(data['choices']) > 0

    def test_error_handling(self, api_url, wait_for_api):
        """Test error handling"""
        # Test invalid model
        response = requests.post(
            f"{api_url}/v1/chat/completions",
            json={
                "model": "non-existent-model",
                "messages": [{"role": "user", "content": "Hello"}]
            },
            headers={'Content-Type': 'application/json'}
        )
        assert response.status_code == 400

        # Test malformed request
        response = requests.post(
            f"{api_url}/v1/chat/completions",
            json={"invalid": "request"},
            headers={'Content-Type': 'application/json'}
        )
        assert response.status_code == 400

    def test_metrics_endpoint(self, api_url, wait_for_api):
        """Test metrics collection"""
        response = requests.get(f"{api_url}/metrics")
        assert response.status_code == 200

        metrics_text = response.text

        # Check for essential metrics
        expected_metrics = [
            'vllm_requests_total',
            'vllm_request_duration_seconds',
            'vllm_tokens_generated_total',
            'vllm_queue_size'
        ]

        for metric in expected_metrics:
            assert metric in metrics_text, f"Missing metric: {metric}"


if __name__ == "__main__":
    # Run tests with pytest
    pytest.main([__file__, "-v", "--tb=short"])
383
tests/load/k6_inference_test.js
Normal file
@ -0,0 +1,383 @@

// K6 Load Testing Script for AI Inference API
// This script tests the inference API under various load conditions

import http from 'k6/http';
import { check, sleep } from 'k6';
import { Rate, Trend, Counter } from 'k6/metrics';
import { htmlReport } from "https://raw.githubusercontent.com/benc-uk/k6-reporter/main/dist/bundle.js";
import { textSummary } from "https://jslib.k6.io/k6-summary/0.0.1/index.js";

// Custom metrics
const failureRate = new Rate('failures');
const inferenceLatency = new Trend('inference_latency');
const tokenThroughput = new Trend('token_throughput');
const queueTime = new Trend('queue_time');
const errorCount = new Counter('errors');
const tokensGenerated = new Counter('tokens_generated');

// Test configuration
export let options = {
  stages: [
    // Warm-up phase
    { duration: '2m', target: 5 },   // Ramp up to 5 users

    // Normal load
    { duration: '5m', target: 10 },  // Stay at 10 users

    // Peak load
    { duration: '3m', target: 25 },  // Ramp up to 25 users
    { duration: '5m', target: 25 },  // Stay at 25 users for 5 minutes

    // Stress test
    { duration: '2m', target: 50 },  // Ramp up to 50 users
    { duration: '3m', target: 50 },  // Stay at 50 users

    // Cool down
    { duration: '2m', target: 0 },   // Ramp down to 0 users
  ],

  thresholds: {
    // Response time requirements
    'http_req_duration': [
      'p(50)<2000',  // 50% of requests under 2s
      'p(95)<5000',  // 95% of requests under 5s
      'p(99)<10000'  // 99% of requests under 10s
    ],

    // Error rate requirements
    'http_req_failed': ['rate<0.05'], // Less than 5% errors
    'failures': ['rate<0.05'],        // Less than 5% failures

    // Inference-specific requirements
    'inference_latency': [
      'p(95)<3000', // 95% of inferences under 3s
    ],
    'token_throughput': [
      'p(50)>20', // At least 20 tokens/sec median
    ],
    'queue_time': [
      'p(95)<1000', // 95% of requests queued less than 1s
    ],
  },

  // External metrics export
  ext: {
    loadimpact: {
      // Project configuration for cloud testing
      name: 'AI Inference Load Test',
      distribution: {
        'amazon:de:frankfurt': { loadZone: 'amazon:de:frankfurt', percent: 100 }
      }
    }
  }
};
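
// Typical local invocation (a sketch; substitute your load balancer IP):
//   k6 run -e API_URL=http://<lb-ip> -e MODEL_NAME=mixtral-8x7b tests/load/k6_inference_test.js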

// Test configuration from environment
const BASE_URL = __ENV.API_URL || 'http://localhost:8000';
const MODEL_NAME = __ENV.MODEL_NAME || 'mixtral-8x7b';
const TEST_DURATION = __ENV.TEST_DURATION || '20m';

// Test scenarios with different prompt types
const TEST_SCENARIOS = [
  {
    name: 'simple_question',
    weight: 0.4,
    prompt: 'What is artificial intelligence?',
    maxTokens: 100,
    temperature: 0.1
  },
  {
    name: 'code_generation',
    weight: 0.3,
    prompt: 'Write a Python function to calculate the factorial of a number.',
    maxTokens: 200,
    temperature: 0.2
  },
  {
    name: 'creative_writing',
    weight: 0.2,
    prompt: 'Write a short story about a robot learning to paint.',
    maxTokens: 300,
    temperature: 0.8
  },
  {
    name: 'long_context',
    weight: 0.1,
    prompt: 'Explain the history of machine learning, including major milestones, key researchers, breakthrough algorithms, and their impact on modern AI applications. Be comprehensive and detailed.',
    maxTokens: 500,
    temperature: 0.5
  }
];

// Helper function to select a test scenario (weighted random choice)
function selectScenario() {
  const random = Math.random();
  let cumulativeWeight = 0;

  for (const scenario of TEST_SCENARIOS) {
    cumulativeWeight += scenario.weight;
    if (random <= cumulativeWeight) {
      return scenario;
    }
  }

  return TEST_SCENARIOS[0]; // fallback
}

// Main test function
export default function() {
  const scenario = selectScenario();

  // Prepare request payload
  const payload = JSON.stringify({
    model: MODEL_NAME,
    messages: [
      {
        role: 'user',
        content: scenario.prompt
      }
    ],
    max_tokens: scenario.maxTokens,
    temperature: scenario.temperature,
    stream: false
  });

  const params = {
    headers: {
      'Content-Type': 'application/json',
    },
    tags: {
      scenario: scenario.name
    },
    timeout: '60s' // 60 second timeout
  };

  // Record start time
  const startTime = Date.now();

  // Make the request
  const response = http.post(`${BASE_URL}/v1/chat/completions`, payload, params);

  // Record end time and calculate metrics
  const endTime = Date.now();
  const requestDuration = endTime - startTime;

  // Check response
  const success = check(response, {
    'status is 200': (r) => r.status === 200,
    'response has body': (r) => r.body && r.body.length > 0,
    'response time < 30s': (r) => r.timings.duration < 30000,
    'has completion': (r) => {
      if (r.status !== 200) return false;
      try {
        const body = JSON.parse(r.body);
        return body.choices && body.choices.length > 0 && body.choices[0].message;
      } catch (e) {
        return false;
      }
    },
    'has usage stats': (r) => {
      if (r.status !== 200) return false;
      try {
        const body = JSON.parse(r.body);
        return body.usage &&
               typeof body.usage.prompt_tokens === 'number' &&
               typeof body.usage.completion_tokens === 'number';
      } catch (e) {
        return false;
      }
    }
  });

  if (!success) {
    failureRate.add(1);
    errorCount.add(1);
    console.error(`Request failed: Status ${response.status}, Scenario: ${scenario.name}`);
    if (response.body) {
      console.error(`Response body: ${response.body.substring(0, 200)}...`);
    }
  } else {
    failureRate.add(0);

    // Parse response for detailed metrics
    try {
      const body = JSON.parse(response.body);

      // Record inference metrics
      inferenceLatency.add(requestDuration);

      if (body.usage) {
        const completionTokens = body.usage.completion_tokens;
        const totalTokens = body.usage.total_tokens;

        tokensGenerated.add(completionTokens);

        // Calculate token throughput (tokens per second)
        const throughput = completionTokens / (requestDuration / 1000);
        tokenThroughput.add(throughput);
      }

      // Estimate queue time (time before processing started)
      // This is an approximation based on response headers or timing
      const queueTimeMs = Math.max(0, requestDuration - (response.timings.duration || requestDuration));
      queueTime.add(queueTimeMs);

    } catch (e) {
      console.error(`Failed to parse response: ${e.message}`);
      errorCount.add(1);
    }
  }

  // Test different endpoints periodically
  if (Math.random() < 0.1) { // 10% of the time
    testHealthEndpoint();
  }

  if (Math.random() < 0.05) { // 5% of the time
    testModelsEndpoint();
  }

  if (Math.random() < 0.02) { // 2% of the time
    testMetricsEndpoint();
  }

  // Variable sleep based on scenario complexity
  const sleepTime = scenario.name === 'long_context' ? 2 : 1;
  sleep(sleepTime);
}

// Health endpoint test
function testHealthEndpoint() {
  const response = http.get(`${BASE_URL}/health`, {
    tags: { endpoint: 'health' },
    timeout: '10s'
  });

  check(response, {
    'health status is 200': (r) => r.status === 200,
    'health response is valid': (r) => {
      try {
        const body = JSON.parse(r.body);
        return body.status === 'healthy';
      } catch (e) {
        return false;
      }
    }
  }) || errorCount.add(1);
}

// Models endpoint test
function testModelsEndpoint() {
  const response = http.get(`${BASE_URL}/v1/models`, {
    tags: { endpoint: 'models' },
    timeout: '10s'
  });

  check(response, {
    'models status is 200': (r) => r.status === 200,
    'models response is valid': (r) => {
      try {
        const body = JSON.parse(r.body);
        return body.object === 'list' && body.data && body.data.length > 0;
      } catch (e) {
        return false;
      }
    }
  }) || errorCount.add(1);
}

// Metrics endpoint test
function testMetricsEndpoint() {
  const response = http.get(`${BASE_URL}/metrics`, {
    tags: { endpoint: 'metrics' },
    timeout: '10s'
  });

  check(response, {
    'metrics status is 200': (r) => r.status === 200,
    'metrics content type': (r) => r.headers['Content-Type'] && r.headers['Content-Type'].includes('text/plain'),
    'has vllm metrics': (r) => r.body && r.body.includes('vllm_requests_total')
  }) || errorCount.add(1);
}

// Setup function (run once at the beginning)
export function setup() {
  console.log(`Starting load test against ${BASE_URL}`);
  console.log(`Model: ${MODEL_NAME}`);
  console.log(`Test scenarios: ${TEST_SCENARIOS.length}`);

  // Verify API is accessible
  const response = http.get(`${BASE_URL}/health`);
  if (response.status !== 200) {
    throw new Error(`API health check failed: ${response.status} ${response.body}`);
  }

  // Get available models
  const modelsResponse = http.get(`${BASE_URL}/v1/models`);
  if (modelsResponse.status === 200) {
    try {
      const models = JSON.parse(modelsResponse.body);
      console.log(`Available models: ${models.data.map(m => m.id).join(', ')}`);

      // Verify our target model is available
      const modelExists = models.data.some(model => model.id === MODEL_NAME);
      if (!modelExists) {
        console.warn(`Warning: Target model '${MODEL_NAME}' not found in available models`);
      }
    } catch (e) {
      console.warn(`Could not parse models response: ${e.message}`);
    }
  }

  return { startTime: Date.now() };
}

// Teardown function (run once at the end)
export function teardown(data) {
  const duration = (Date.now() - data.startTime) / 1000;
  console.log(`Load test completed in ${duration.toFixed(1)} seconds`);
}

// Custom summary report
export function handleSummary(data) {
  return {
    "k6-report.html": htmlReport(data),
    "k6-report.json": JSON.stringify(data, null, 2),
    "stdout": textSummary(data, { indent: " ", enableColors: true }),
  };
}

// Stress test scenario (can be run separately)
export const stressTest = {
  executor: 'ramping-arrival-rate',
  startRate: 1,
  timeUnit: '1s',
  preAllocatedVUs: 10,
  maxVUs: 100,
  stages: [
    { duration: '5m', target: 50 },   // Ramp up to 50 RPS
    { duration: '10m', target: 100 }, // Stay at 100 RPS
    { duration: '5m', target: 0 },    // Ramp down
  ],
  exec: 'stressTestFunction'
};

// Stress test function
export function stressTestFunction() {
  // Use simpler, faster requests for stress testing
  const payload = JSON.stringify({
    model: MODEL_NAME,
    messages: [{ role: 'user', content: 'Hello!' }],
    max_tokens: 10,
    temperature: 0.1
  });

  const response = http.post(`${BASE_URL}/v1/chat/completions`, payload, {
    headers: { 'Content-Type': 'application/json' },
    timeout: '30s'
  });

  check(response, {
    'stress test response ok': (r) => r.status === 200
  }) || errorCount.add(1);
}
332
tests/terraform/infrastructure_test.go
Normal file
@ -0,0 +1,332 @@

// Infrastructure testing with Terratest
package test

import (
	"crypto/tls"
	"fmt"
	"net/http"
	"testing"
	"time"

	"github.com/gruntwork-io/terratest/modules/retry"
	"github.com/gruntwork-io/terratest/modules/terraform"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

// TestTerraformInfrastructure tests the complete infrastructure deployment
func TestTerraformInfrastructure(t *testing.T) {
	t.Parallel()

	// All resources live in Hetzner's eu-central network zone, so the test
	// runs against the staging environment directly.
	terraformDir := "../../terraform/environments/staging"

	// Construct the terraform options with default retryable errors to handle
	// the most common retryable errors in terraform testing.
	terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{
		// The path to where our Terraform code is located
		TerraformDir: terraformDir,

		// Variables to pass to our Terraform code using -var options
		Vars: map[string]interface{}{
			"environment":    "test",
			"gex44_count":    1,
			"ssh_public_key": "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC...", // Test key
			"hcloud_token":   "dummy-token-for-testing",
		},

		// Disable colors in Terraform commands so it's easier to parse stdout/stderr
		NoColor: true,
	})

	// At the end of the test, run `terraform destroy` to clean up any resources that were created
	defer terraform.Destroy(t, terraformOptions)

	// This will run `terraform init` and `terraform apply` and fail the test if there are any errors
	terraform.InitAndApply(t, terraformOptions)

	// Run basic infrastructure tests
	testInfrastructureOutputs(t, terraformOptions)
	testNetworkConnectivity(t, terraformOptions)
	testLoadBalancer(t, terraformOptions)
	testMonitoring(t, terraformOptions)
}

// TestTerraformModules tests individual Terraform modules
func TestTerraformModules(t *testing.T) {
	t.Parallel()

	testCases := []struct {
		name       string
		modulePath string
	}{
		{"hcloud-base", "../../terraform/modules/hcloud-base"},
		{"load-balancer", "../../terraform/modules/load-balancer"},
		{"monitoring", "../../terraform/modules/monitoring"},
	}

	for _, tc := range testCases {
		tc := tc // capture range variable
		t.Run(tc.name, func(t *testing.T) {
			t.Parallel()
			testTerraformModule(t, tc.modulePath)
		})
	}
}

func testTerraformModule(t *testing.T, modulePath string) {
	terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{
		TerraformDir: modulePath,
		Vars: map[string]interface{}{
			"environment":    "test",
			"ssh_public_key": "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC...",
		},
		NoColor: true,
	})

	defer terraform.Destroy(t, terraformOptions)
	terraform.InitAndApply(t, terraformOptions)
}

func testInfrastructureOutputs(t *testing.T, terraformOptions *terraform.Options) {
	// Test that all required outputs are present and valid
	loadBalancerIP := terraform.Output(t, terraformOptions, "load_balancer_ip")
	assert.NotEmpty(t, loadBalancerIP, "Load balancer IP should not be empty")

	monitoringIP := terraform.Output(t, terraformOptions, "monitoring_ip")
	assert.NotEmpty(t, monitoringIP, "Monitoring IP should not be empty")

	apiEndpoints := terraform.OutputMap(t, terraformOptions, "api_endpoints")
	assert.Contains(t, apiEndpoints, "inference", "Should contain inference endpoint")
	assert.Contains(t, apiEndpoints, "health", "Should contain health endpoint")
}

func testNetworkConnectivity(t *testing.T, terraformOptions *terraform.Options) {
	// Test network connectivity between components
	privateNetworkID := terraform.Output(t, terraformOptions, "private_network_id")
	assert.NotEmpty(t, privateNetworkID, "Private network ID should not be empty")

	// Test that servers can communicate over private network
	// This would require actual server provisioning in a real test
}

func testLoadBalancer(t *testing.T, terraformOptions *terraform.Options) {
	loadBalancerIP := terraform.Output(t, terraformOptions, "load_balancer_ip")

	// Test load balancer health endpoint
	healthURL := fmt.Sprintf("http://%s/health", loadBalancerIP)

	// Wait for load balancer to be ready
	maxRetries := 10
	timeBetweenRetries := 30 * time.Second

	retry.DoWithRetry(t, "Test load balancer health", maxRetries, timeBetweenRetries, func() (string, error) {
		resp, err := http.Get(healthURL)
		if err != nil {
			return "", err
		}
		defer resp.Body.Close()

		if resp.StatusCode != 200 {
			return "", fmt.Errorf("Expected status 200, got %d", resp.StatusCode)
		}

		return "Load balancer is healthy", nil
	})
}

func testMonitoring(t *testing.T, terraformOptions *terraform.Options) {
	monitoringIP := terraform.Output(t, terraformOptions, "monitoring_ip")

	// Test Prometheus endpoint
	prometheusURL := fmt.Sprintf("http://%s:9090/api/v1/query?query=up", monitoringIP)

	maxRetries := 10
	timeBetweenRetries := 30 * time.Second

	retry.DoWithRetry(t, "Test Prometheus", maxRetries, timeBetweenRetries, func() (string, error) {
		resp, err := http.Get(prometheusURL)
		if err != nil {
			return "", err
		}
		defer resp.Body.Close()

		if resp.StatusCode != 200 {
			return "", fmt.Errorf("Expected status 200, got %d", resp.StatusCode)
		}

		return "Prometheus is responding", nil
	})

	// Test Grafana endpoint
	grafanaURL := fmt.Sprintf("https://%s:3000/api/health", monitoringIP)

	retry.DoWithRetry(t, "Test Grafana", maxRetries, timeBetweenRetries, func() (string, error) {
		// Skip SSL verification for test
		tr := &http.Transport{
			TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
		}
		client := &http.Client{Transport: tr}

		resp, err := client.Get(grafanaURL)
		if err != nil {
			return "", err
		}
		defer resp.Body.Close()

		if resp.StatusCode != 200 {
			return "", fmt.Errorf("Expected status 200, got %d", resp.StatusCode)
		}

		return "Grafana is responding", nil
	})
}

// TestTerraformValidation tests that all Terraform files are valid
func TestTerraformValidation(t *testing.T) {
	environments := []string{"dev", "staging", "production"}

	for _, env := range environments {
		env := env
		t.Run(fmt.Sprintf("validate-%s", env), func(t *testing.T) {
			t.Parallel()

			terraformDir := fmt.Sprintf("../../terraform/environments/%s", env)
			terraformOptions := &terraform.Options{
				TerraformDir: terraformDir,
				NoColor:      true,
			}

			terraform.Init(t, terraformOptions)
			terraform.Validate(t, terraformOptions)
		})
	}
}

// TestTerraformPlan tests that Terraform plans complete without errors
func TestTerraformPlan(t *testing.T) {
	terraformDir := "../../terraform/environments/staging"

	terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{
		TerraformDir: terraformDir,
		Vars: map[string]interface{}{
			"environment":    "test",
			"gex44_count":    1,
			"ssh_public_key": "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC...",
			"hcloud_token":   "dummy-token-for-testing",
		},
		PlanFilePath: "test.tfplan",
		NoColor:      true,
	})

	terraform.Init(t, terraformOptions)
	terraform.Plan(t, terraformOptions)
}
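
// Note: the read-only tests below consume terraform outputs, which assumes
// state from a prior apply of the referenced environment is already present.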

// TestCostEstimation validates that the infrastructure cost is within expected bounds
func TestCostEstimation(t *testing.T) {
	terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{
		TerraformDir: "../../terraform/environments/production",
		Vars: map[string]interface{}{
			"environment": "production",
			"gex44_count": 3,
		},
		NoColor: true,
	})

	terraform.Init(t, terraformOptions)

	// Get estimated monthly cost from outputs. OutputMapOfObjects is used
	// (rather than OutputMap, which stringifies values) so that numbers
	// decode as float64.
	estimatedCostOutput := terraform.OutputMapOfObjects(t, terraformOptions, "estimated_monthly_cost")

	totalCost, exists := estimatedCostOutput["total_monthly"]
	require.True(t, exists, "total_monthly cost should be in outputs")

	// Validate cost is within expected bounds (should be around 691 EUR)
	expectedMinCost := 600.0
	expectedMaxCost := 800.0

	costFloat, ok := totalCost.(float64)
	require.True(t, ok, "Cost should be a number")

	assert.GreaterOrEqual(t, costFloat, expectedMinCost, "Cost should be at least €600")
	assert.LessOrEqual(t, costFloat, expectedMaxCost, "Cost should be at most €800")
}

// TestSecurityConfiguration validates security settings
func TestSecurityConfiguration(t *testing.T) {
	terraformDir := "../../terraform/environments/production"

	terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{
		TerraformDir: terraformDir,
		NoColor:      true,
	})

	terraform.Init(t, terraformOptions)

	// Get firewall rules from outputs (as objects, so lists stay lists)
	firewallRules := terraform.OutputMapOfObjects(t, terraformOptions, "firewall_rules")

	// Validate that SSH is not open to the world in production
	sshAllowedCIDRs, exists := firewallRules["ssh_allowed_cidrs"]
	require.True(t, exists, "SSH allowed CIDRs should be defined")

	// In production, SSH should not be 0.0.0.0/0
	cidrs, ok := sshAllowedCIDRs.([]interface{})
	require.True(t, ok, "SSH CIDRs should be a list")

	for _, cidr := range cidrs {
		cidrStr, ok := cidr.(string)
		require.True(t, ok, "CIDR should be a string")
		assert.NotEqual(t, "0.0.0.0/0", cidrStr, "SSH should not be open to the world in production")
	}
}

// TestDisasterRecovery tests backup and recovery capabilities
func TestDisasterRecovery(t *testing.T) {
	terraformDir := "../../terraform/environments/staging"

	terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{
		TerraformDir: terraformDir,
		Vars: map[string]interface{}{
			"environment":    "dr-test",
			"enable_backups": true,
		},
		NoColor: true,
	})

	defer terraform.Destroy(t, terraformOptions)
	terraform.InitAndApply(t, terraformOptions)

	// Get backup configuration (as objects, so bools and numbers keep their types)
	backupInfo := terraform.OutputMapOfObjects(t, terraformOptions, "backup_info")

	enabled, exists := backupInfo["enabled"]
	require.True(t, exists, "Backup enabled flag should exist")
	assert.True(t, enabled.(bool), "Backups should be enabled")

	retentionDays, exists := backupInfo["retention_days"]
	require.True(t, exists, "Backup retention should be defined")
	assert.GreaterOrEqual(t, retentionDays.(float64), 7.0, "Backup retention should be at least 7 days")
}

// Benchmark tests for performance validation
func BenchmarkTerraformPlan(b *testing.B) {
	terraformDir := "../../terraform/environments/staging"

	for i := 0; i < b.N; i++ {
		terraformOptions := &terraform.Options{
			TerraformDir: terraformDir,
			Vars: map[string]interface{}{
				"environment": fmt.Sprintf("bench-%d", i),
			},
			NoColor: true,
		}

		terraform.Init(b, terraformOptions)
		terraform.Plan(b, terraformOptions)
	}
}