# Makefile (250 lines, 9.9 KiB)
# Every target in this file is a command, not a file to build. Declaring all of
# them phony keeps a same-named file or directory (for example a docs/ folder
# shadowing the `docs` target) from making Make report "up to date" and skip
# the recipe.
.PHONY: help setup validate test test-load \
	plan deploy-infra configure-servers deploy-dev deploy-staging deploy-prod \
	scale-up scale-down cost-report metrics status \
	backup restore destroy clean \
	dev-logs dev-ssh logs docs docs-serve \
	ci-validate ci-deploy-staging ci-deploy-production \
	quick-status emergency-scale env-info
|
|
|
|
# Default target
|
|
# Prints a banner, then one line per target that carries a trailing
# "## description" annotation, by scanning $(MAKEFILE_LIST) with awk.
# The target name is printed in cyan, padded to 20 columns.
help: ## Show this help message
	@echo "AI Infrastructure Management Commands"
	@echo "===================================="
	@awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)
|
|
|
|
# Environment detection
|
|
# Target environment; defaults to dev and can be overridden, e.g. `make plan ENV=staging`.
ENV ?= dev

# Terraform directory for the selected environment (kept recursive so it
# always follows the current value of ENV).
TF_DIR = terraform/environments/$(ENV)

# Root of the Ansible project (playbooks/, roles/, inventory/ are referenced below it).
ANSIBLE_DIR := ansible
|
|
|
|
# Setup and dependencies
|
|
# Verifies that terraform, ansible, go and k6 are on PATH (aborting with an
# install hint for the first one missing), then installs the Ansible Galaxy
# requirements and downloads the Go modules used by the Terraform tests.
setup: ## Install all dependencies and tools
	@echo "🔧 Installing dependencies..."
	@command -v terraform >/dev/null 2>&1 || { echo "❌ Terraform not found. Install from https://terraform.io"; exit 1; }
	@command -v ansible >/dev/null 2>&1 || { echo "❌ Ansible not found. Install with: pip install ansible"; exit 1; }
	@command -v go >/dev/null 2>&1 || { echo "❌ Go not found (needed for tests). Install from https://golang.org"; exit 1; }
	@command -v k6 >/dev/null 2>&1 || { echo "❌ K6 not found. Install from https://k6.io"; exit 1; }
	@echo "✅ Installing Ansible dependencies..."
	cd $(ANSIBLE_DIR) && ansible-galaxy install -r requirements.yml
	@echo "✅ Installing Go test dependencies..."
	cd tests/terraform && go mod download
	@echo "✅ Setup complete!"
|
|
|
|
# Validation and linting
|
|
# Validates the Terraform configuration of every environment (dev, staging,
# production) without touching remote backends, then syntax-checks and lints
# the Ansible playbooks.
#
# Each environment is validated in a subshell so the working directory is
# reset for the next iteration, and the loop aborts on the first failure
# instead of reporting only the status of the last environment.
validate: ## Validate all configurations
	@echo "🔍 Validating Terraform configurations..."
	@for env in dev staging production; do \
		echo "Validating $$env environment..."; \
		( cd terraform/environments/$$env && terraform init -backend=false && terraform validate ) || exit 1; \
	done
	@echo "🔍 Validating Ansible playbooks..."
	cd $(ANSIBLE_DIR) && ansible-playbook --syntax-check playbooks/site.yml
	cd $(ANSIBLE_DIR) && ansible-lint playbooks/
	@echo "✅ All configurations valid!"
|
|
|
|
# Testing
|
|
# Runs the full test suite after `validate` has succeeded:
#   1. Go tests under tests/terraform
#   2. Molecule tests for the vllm Ansible role
#   3. The inference API contract test script
test: validate ## Run all tests
	@echo "🧪 Running infrastructure tests..."
	cd tests/terraform && go test -v ./...
	@echo "🧪 Running Ansible tests..."
	cd $(ANSIBLE_DIR)/roles/vllm && molecule test
	@echo "🧪 Running contract tests..."
	python tests/contracts/test_inference_api.py
	@echo "✅ All tests passed!"
|
|
|
|
# Runs the k6 load test script against the endpoint given by API_URL.
# Fails with a usage hint when API_URL is empty.
#
# API_URL is quoted when handed to k6 so that URLs containing shell
# metacharacters (such as `&` or `?` in a query string) are passed through
# intact instead of being interpreted by the shell.
test-load: ## Run load tests against deployed infrastructure
	@echo "📊 Running load tests..."
	@if [ -z "$(API_URL)" ]; then \
		echo "❌ API_URL environment variable required"; \
		echo "Usage: make test-load API_URL=https://api.yourcompany.com"; \
		exit 1; \
	fi
	API_URL="$(API_URL)" k6 run tests/load/k6_inference_test.js
|
|
|
|
# Infrastructure deployment
|
|
# Initialises Terraform in $(TF_DIR) and writes the execution plan to
# $(ENV).tfplan inside that directory.
plan: ## Plan infrastructure changes
	@echo "📋 Planning $(ENV) infrastructure..."
	cd $(TF_DIR) && \
		terraform init && \
		terraform plan -out=$(ENV).tfplan
|
|
|
|
# Applies the saved plan file $(ENV).tfplan from $(TF_DIR).
# The plan file is expected to exist already (the `plan` target writes it).
deploy-infra: ## Deploy infrastructure only
	@echo "🚀 Deploying $(ENV) infrastructure..."
	cd $(TF_DIR) && \
		terraform apply $(ENV).tfplan
	@echo "✅ Infrastructure deployed!"
|
|
|
|
# Runs the site playbook against the inventory file of the selected environment.
configure-servers: ## Configure servers with Ansible
	@echo "⚙️ Configuring servers..."
	cd $(ANSIBLE_DIR) && \
		ansible-playbook -i inventory/$(ENV).yml playbooks/site.yml
	@echo "✅ Servers configured!"
|
|
|
|
# Plans, applies and configures the development environment.
#
# The plan step is invoked explicitly with ENV=dev rather than as a
# prerequisite: a prerequisite would plan whatever ENV currently resolves to
# (for example a value exported in the caller's environment), while the apply
# step below always consumes dev.tfplan.
deploy-dev: ## Deploy development environment
	@$(MAKE) plan ENV=dev
	@$(MAKE) deploy-infra ENV=dev
	@$(MAKE) configure-servers ENV=dev
	@echo "🎉 Development environment ready!"
|
|
|
|
# Plans, applies and configures the staging environment.
#
# The plan step is invoked explicitly with ENV=staging. As a bare prerequisite
# it would run with the default ENV (dev) and write dev.tfplan, leaving the
# apply step to fail on a missing staging.tfplan or to apply a stale one.
deploy-staging: ## Deploy staging environment
	@$(MAKE) plan ENV=staging
	@$(MAKE) deploy-infra ENV=staging
	@$(MAKE) configure-servers ENV=staging
	@echo "🎉 Staging environment ready!"
|
|
|
|
# Interactive production deployment: asks for confirmation, then runs
# plan -> deploy-infra -> configure-servers with ENV=production.
#
# The prompt uses only POSIX shell constructs (printf / read -r / case) so it
# works under Make's default /bin/sh, and the three steps are chained with &&
# so a failed plan or apply stops the deployment and fails the target instead
# of continuing and printing the success banner.
# Answer with `y` or `Y` followed by Enter; anything else cancels (exit 0).
deploy-prod: ## Deploy production environment (requires manual approval)
	@echo "⚠️ Production deployment requires explicit confirmation"
	@echo "This will deploy to PRODUCTION environment."
	@printf "Are you sure? [y/N] "; \
	read -r REPLY; \
	case "$$REPLY" in \
		[Yy]) \
			$(MAKE) plan ENV=production && \
			$(MAKE) deploy-infra ENV=production && \
			$(MAKE) configure-servers ENV=production && \
			echo "🎉 Production environment ready!" ;; \
		*) \
			echo "❌ Production deployment cancelled" ;; \
	esac
|
|
|
|
# Scaling operations
|
|
# Invokes the autoscaler script with the scale-up action for a single server.
scale-up: ## Add one GPU server
	@echo "📈 Scaling up GPU servers..."
	python scripts/autoscaler.py \
		--action=scale-up \
		--count=1
	@echo "✅ Scale up initiated!"
|
|
|
|
# Invokes the autoscaler script with the scale-down action for a single server.
scale-down: ## Remove one GPU server
	@echo "📉 Scaling down GPU servers..."
	python scripts/autoscaler.py \
		--action=scale-down \
		--count=1
	@echo "✅ Scale down initiated!"
|
|
|
|
# Monitoring and reporting
|
|
# Writes the cost analysis in Markdown and JSON form to
# reports/cost-report-YYYYMMDD.{md,json}.
#
# The output directory is created first so the redirections cannot fail on a
# fresh checkout, and the date stamp is computed once so both files always
# share the same name even when the run crosses midnight.
cost-report: ## Generate cost analysis report
	@echo "💰 Generating cost report..."
	@mkdir -p reports
	stamp=$$(date +%Y%m%d) && \
	python scripts/cost-analysis.py --format=markdown > "reports/cost-report-$$stamp.md" && \
	python scripts/cost-analysis.py --format=json > "reports/cost-report-$$stamp.json"
	@echo "✅ Cost report generated in reports/"
|
|
|
|
# Prints a banner followed by the output of the decision-metrics script in
# summary mode.
metrics: ## Show current infrastructure metrics
	@echo "📊 Current Infrastructure Metrics"
	@echo "=================================="
	@python scripts/decision-metrics.py \
		--summary
|
|
|
|
# Two-part status report for the selected environment:
#   - Terraform: root-module resources whose type contains "hcloud", printed
#     as "<type>: <name> - <status>" ("unknown" when no status is recorded).
#   - Ansible: a one-line ping result for every host in the inventory.
status: ## Show infrastructure status
	@echo "🔍 Infrastructure Status"
	@echo "======================="
	@cd $(TF_DIR) && \
		terraform show -json | jq -r '.values.root_module.resources[] | select(.type | contains("hcloud")) | "\(.type): \(.values.name) - \(.values.status // "unknown")"'
	@echo ""
	@echo "🖥️ Server Health"
	@echo "==============="
	@cd $(ANSIBLE_DIR) && \
		ansible all -i inventory/$(ENV).yml -m ping --one-line
|
|
|
|
# Backup and recovery
|
|
# Saves the Terraform state and the Ansible inventory of the selected
# environment under backups/YYYYMMDD/.
#
# The destination is computed once as an absolute path based on $(CURDIR), so
# every step writes into the directory that was actually created (the date
# cannot change between steps) and no step depends on counting `../` hops.
# Steps are chained with && so a failed state pull stops the backup.
backup: ## Create infrastructure backup
	@echo "💾 Creating infrastructure backup..."
	dest="$(CURDIR)/backups/$$(date +%Y%m%d)" && \
	mkdir -p "$$dest" && \
	( cd $(TF_DIR) && terraform state pull > "$$dest/terraform-state-$(ENV).json" ) && \
	( cd $(ANSIBLE_DIR) && tar czf "$$dest/ansible-inventory-$(ENV).tar.gz" inventory/ ) && \
	echo "✅ Backup created in backups/$${dest##*/}/"
|
|
|
|
# Pushes a previously saved Terraform state file back to the backend.
# Requires BACKUP_DATE (the name of a directory under backups/) and an
# interactive confirmation.
#
# The prompt uses only POSIX shell constructs so it works under Make's default
# /bin/sh, and the success message is chained with && so it is printed only
# when `terraform state push` actually succeeded; a failed push fails the
# target.
# Answer with `y` or `Y` followed by Enter; anything else cancels (exit 0).
restore: ## Restore infrastructure from backup
	@echo "⚠️ This will restore infrastructure from backup"
	@if [ -z "$(BACKUP_DATE)" ]; then \
		echo "❌ BACKUP_DATE required"; \
		echo "Usage: make restore BACKUP_DATE=20241201"; \
		exit 1; \
	fi
	@if [ ! -d "backups/$(BACKUP_DATE)" ]; then \
		echo "❌ Backup directory backups/$(BACKUP_DATE) not found"; \
		exit 1; \
	fi
	@printf "Restore from backup $(BACKUP_DATE)? [y/N] "; \
	read -r REPLY; \
	case "$$REPLY" in \
		[Yy]) \
			cd $(TF_DIR) && \
			terraform state push ../../../backups/$(BACKUP_DATE)/terraform-state-$(ENV).json && \
			echo "✅ State restored from backup" ;; \
		*) \
			echo "❌ Restore cancelled" ;; \
	esac
|
|
|
|
# Cleanup
|
|
# Destroys the infrastructure of the selected environment after the operator
# types the exact phrase "<ENV>-destroy-confirm".
#
# The comparison uses the POSIX `[ ... = ... ]` test so it works under Make's
# default /bin/sh, and the "destroyed" message is chained with && so it is
# printed only when `terraform destroy` succeeded; a failed or declined
# destroy fails the target instead of claiming success.
destroy: ## Destroy infrastructure (requires confirmation)
	@echo "💥 This will DESTROY the $(ENV) infrastructure!"
	@echo "All servers, data, and configurations will be permanently deleted."
	@printf "Type '$(ENV)-destroy-confirm' to proceed: "; \
	read -r REPLY; \
	if [ "$$REPLY" = "$(ENV)-destroy-confirm" ]; then \
		cd $(TF_DIR) && terraform destroy && echo "💥 Infrastructure destroyed!"; \
	else \
		echo "❌ Destruction cancelled (incorrect confirmation)"; \
	fi
|
|
|
|
# Removes generated artefacts anywhere below the current directory:
# saved Terraform plans, .terraform working directories, compiled Python
# files and __pycache__ directories.
clean: ## Clean temporary files and caches
	@echo "🧹 Cleaning temporary files..."
	find . -name "*.tfplan" -delete
	find . -name ".terraform" -type d \
		-exec rm -rf {} +
	find . -name "*.pyc" -delete
	find . -name "__pycache__" -type d \
		-exec rm -rf {} +
	@echo "✅ Cleanup complete!"
|
|
|
|
# Development helpers
|
|
# Shows the last 50 journal entries of the vllm-api unit on the hosts in the
# gex44 group of the dev inventory.
dev-logs: ## Show logs from development environment
	@echo "📋 Development Environment Logs"
	@echo "=============================="
	cd $(ANSIBLE_DIR) && \
		ansible gex44 -i inventory/dev.yml -m shell -a "journalctl -u vllm-api -n 50 --no-pager"
|
|
|
|
# Opens an SSH session as `ubuntu` to the first address in the `gex44_ips`
# Terraform output of the development environment.
#
# The dev Terraform directory is used explicitly (matching dev-logs, which
# always uses the dev inventory) so an overridden ENV cannot silently connect
# to another environment. The lookup result is checked before calling ssh:
# jq's `// empty` turns a missing output into an empty string, which is
# reported as an error instead of attempting `ssh ubuntu@null`.
dev-ssh: ## SSH to development GPU server
	@echo "🔌 Connecting to development GPU server..."
	@SERVER_IP=$$(cd terraform/environments/dev && terraform output -json | jq -r '.gex44_ips.value[0] // empty'); \
	if [ -z "$$SERVER_IP" ]; then \
		echo "❌ Could not read a server IP from Terraform output 'gex44_ips'"; \
		exit 1; \
	fi; \
	ssh -i ~/.ssh/hetzner_key ubuntu@"$$SERVER_IP"
|
|
|
|
# Shows the last 50 journal entries of the unit named by SERVICE on every host
# in the selected environment's inventory. Without SERVICE it prints the list
# of known services plus a usage line and fails.
logs: ## Show logs from specified environment
	@[ -n "$(SERVICE)" ] || { \
		echo "📋 Available services: vllm-api, haproxy, prometheus, grafana"; \
		echo "Usage: make logs SERVICE=vllm-api ENV=production"; \
		exit 1; \
	}
	cd $(ANSIBLE_DIR) && \
		ansible all -i inventory/$(ENV).yml -m shell -a "journalctl -u $(SERVICE) -n 50 --no-pager"
|
|
|
|
# Documentation
|
|
# Builds the documentation site with mkdocs, installing mkdocs through pip
# first when the command is not found on PATH.
docs: ## Generate documentation
	@echo "📚 Generating documentation..."
	@command -v mkdocs >/dev/null 2>&1 \
		|| pip install mkdocs
	mkdocs build
	@echo "✅ Documentation generated in site/"
|
|
|
|
# Starts the mkdocs development server in the foreground; the banner
# announces http://localhost:8000 as the address.
docs-serve: ## Serve documentation locally
	@echo "📖 Serving documentation at http://localhost:8000"
	mkdocs serve
|
|
|
|
# CI/CD helpers
|
|
# CI entry point for validation and tests.
#
# Only `test` is invoked: it lists `validate` as a prerequisite, so validation
# still runs first, but exactly once. Invoking `validate` separately as well
# ran terraform init/validate for every environment, and the Ansible checks,
# twice per pipeline run.
ci-validate: ## Validation for CI pipeline
	@$(MAKE) test
|
|
|
|
# CI entry point: delegates to the deploy-staging target in a sub-make.
ci-deploy-staging: ## Deploy staging (for CI)
	@$(MAKE) deploy-staging
|
|
|
|
# CI entry point: delegates to the deploy-prod target in a sub-make.
# NOTE(review): deploy-prod reads a confirmation from stdin before deploying;
# confirm the CI job supplies that answer, otherwise nothing is deployed.
ci-deploy-production: ## Deploy production (for CI)
	@$(MAKE) deploy-prod
|
|
|
|
# Quick operations
|
|
# One-screen overview: selected environment, number of resources in the
# Terraform state, API health (HTTP 200 on $(API_URL)/health) and the most
# recently modified entry under backups/.
#
# The last-backup lookup captures the listing first and substitutes the
# fallback text when it is empty. Chaining `|| echo` after `ls | head` never
# triggered, because the pipeline's exit status is that of `head`; a missing
# backups/ directory is now reported as "No backups" rather than an ls error.
quick-status: ## Quick infrastructure overview
	@echo "⚡ Quick Status Overview"
	@echo "======================"
	@echo "Environment: $(ENV)"
	@echo "Terraform state: $$(cd $(TF_DIR) && terraform show -json 2>/dev/null | jq -r '.values.root_module.resources | length // "No resources"') resources"
	@python -c "import requests; print('API Health:', 'OK' if requests.get('$(API_URL)/health', timeout=5).status_code == 200 else 'FAIL')" 2>/dev/null || echo "API Health: Unknown (set API_URL)"
	@last_backup=$$(ls -1t backups/ 2>/dev/null | head -1); \
	echo "Last backup: $${last_backup:-No backups}"
|
|
|
|
# Asks for a server count between 1 and 5 and passes it to the autoscaler's
# emergency-scale action.
#
# The prompt uses only POSIX shell constructs so it works under Make's default
# /bin/sh. The "initiated" message is chained with && so it is printed only
# when the autoscaler call succeeded, and an invalid count now fails the
# target instead of exiting successfully without doing anything.
# Enter a single digit followed by Enter.
emergency-scale: ## Emergency scale up (bypasses normal limits)
	@echo "🚨 EMERGENCY SCALE UP"
	@echo "This will immediately order new GPU servers"
	@printf "Number of servers to add [1-5]: "; \
	read -r REPLY; \
	case "$$REPLY" in \
		[1-5]) \
			python scripts/autoscaler.py --action=emergency-scale --count="$$REPLY" && \
			echo "🚨 Emergency scale initiated for $$REPLY servers" ;; \
		*) \
			echo "❌ Invalid server count"; \
			exit 1 ;; \
	esac
|
|
|
|
# Environment info
|
|
# Prints the Make-level settings (ENV, TF_DIR, ANSIBLE_DIR) and reports, for
# each environment variable the tooling expects, whether it is set in the
# recipe's environment. Only API_URL has its value echoed; the credentials
# are reported as set/missing without printing them.
env-info: ## Show environment configuration
	@echo "🔍 Environment Information"
	@echo "========================="
	@echo "Current Environment: $(ENV)"
	@echo "Terraform Directory: $(TF_DIR)"
	@echo "Ansible Directory: $(ANSIBLE_DIR)"
	@echo ""
	@echo "Required Environment Variables:"
	@echo "------------------------------"
	@echo "HCLOUD_TOKEN: $$(if [ -n "$$HCLOUD_TOKEN" ]; then echo "✅ Set"; else echo "❌ Missing"; fi)"
	@echo "ROBOT_API_USER: $$(if [ -n "$$ROBOT_API_USER" ]; then echo "✅ Set"; else echo "❌ Missing"; fi)"
	@echo "ROBOT_API_PASSWORD: $$(if [ -n "$$ROBOT_API_PASSWORD" ]; then echo "✅ Set"; else echo "❌ Missing"; fi)"
	@echo "API_URL: $$(if [ -n "$$API_URL" ]; then echo "✅ Set ($$API_URL)"; else echo "❌ Missing"; fi)"
|