2025-09-13 14:18:28 +02:00

250 lines
9.9 KiB
Makefile

# Every target in this file names a command, not a file it produces, so all of
# them are declared phony. Without this, a stray file or directory sharing a
# target's name (for example a `docs` or `backups` directory, or a file called
# `test`) would make that target look up to date and silently skip its recipe.
.PHONY: help setup validate test test-load plan deploy-infra configure-servers \
        deploy-dev deploy-staging deploy-prod scale-up scale-down cost-report \
        metrics status backup restore destroy clean dev-logs dev-ssh logs docs \
        docs-serve ci-validate ci-deploy-staging ci-deploy-production \
        quick-status emergency-scale env-info
# Default target
# Prints a two-line banner, then scans the parsed makefiles for rule headers
# that carry a trailing double-hash description and lists each target name
# (cyan, padded to 20 columns) next to its description.
help: ## Show this help message
	@printf '%s\n' \
		"AI Infrastructure Management Commands" \
		"===================================="
	@awk -F ':.*?## ' \
		'/^[a-zA-Z_-]+:.*?## / {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}' \
		$(MAKEFILE_LIST)
# Environment detection
# ENV selects the target environment. `?=` keeps any value already supplied on
# the command line or in the process environment and otherwise defaults to dev.
ENV ?= dev
# Terraform working directory for the selected environment. Recursively
# expanded (`=`), so $(ENV) is resolved each time TF_DIR is referenced.
TF_DIR = terraform/environments/$(ENV)
# Directory that recipes `cd` into before running Ansible commands.
ANSIBLE_DIR = ansible
# Setup and dependencies
# Checks that terraform, ansible, go and k6 are on PATH (aborting with an
# install hint on the first one that is missing), then installs the Ansible
# Galaxy requirements and downloads the Go modules used by the Terraform tests.
setup: ## Install all dependencies and tools
	@printf '%s\n' "🔧 Installing dependencies..."
	@if ! command -v terraform >/dev/null 2>&1; then \
		echo "❌ Terraform not found. Install from https://terraform.io"; \
		exit 1; \
	fi
	@if ! command -v ansible >/dev/null 2>&1; then \
		echo "❌ Ansible not found. Install with: pip install ansible"; \
		exit 1; \
	fi
	@if ! command -v go >/dev/null 2>&1; then \
		echo "❌ Go not found (needed for tests). Install from https://golang.org"; \
		exit 1; \
	fi
	@if ! command -v k6 >/dev/null 2>&1; then \
		echo "❌ K6 not found. Install from https://k6.io"; \
		exit 1; \
	fi
	@printf '%s\n' "✅ Installing Ansible dependencies..."
	cd $(ANSIBLE_DIR) && ansible-galaxy install -r requirements.yml
	@printf '%s\n' "✅ Installing Go test dependencies..."
	cd tests/terraform && go mod download
	@printf '%s\n' "✅ Setup complete!"
# Validation and linting
# Validates the Terraform configuration of every environment (dev, staging,
# production) without touching a backend, then syntax-checks and lints the
# Ansible playbooks.
#
# Each environment is validated in its own subshell so the working directory
# is always reset for the next iteration, and the loop stops with a non-zero
# status on the first failure. Previously a failed step skipped the trailing
# `cd ../../../`, and only the last iteration's status reached Make, so an
# early failure could go unreported.
validate: ## Validate all configurations
	@echo "🔍 Validating Terraform configurations..."
	@for env in dev staging production; do \
		echo "Validating $$env environment..."; \
		( cd "terraform/environments/$$env" && terraform init -backend=false && terraform validate ) || exit 1; \
	done
	@echo "🔍 Validating Ansible playbooks..."
	cd $(ANSIBLE_DIR) && ansible-playbook --syntax-check playbooks/site.yml
	cd $(ANSIBLE_DIR) && ansible-lint playbooks/
	@echo "✅ All configurations valid!"
# Testing
# Runs after `validate`. Executes, in order, the Go tests under
# tests/terraform, the Molecule scenario of the vllm Ansible role, and the
# Python contract test for the inference API.
test: validate ## Run all tests
	@printf '%s\n' "🧪 Running infrastructure tests..."
	cd tests/terraform && go test -v ./...
	@printf '%s\n' "🧪 Running Ansible tests..."
	cd $(ANSIBLE_DIR)/roles/vllm && molecule test
	@printf '%s\n' "🧪 Running contract tests..."
	python tests/contracts/test_inference_api.py
	@printf '%s\n' "✅ All tests passed!"
# Runs the k6 load test against the endpoint given by API_URL and aborts with
# a usage hint when API_URL is empty.
#
# The URL is quoted when handed to k6: unquoted, a URL containing `&`, `;` or
# `?` (e.g. a query string) would be split or globbed by the shell instead of
# reaching k6 intact.
test-load: ## Run load tests against deployed infrastructure
	@echo "📊 Running load tests..."
	@if [ -z "$(API_URL)" ]; then \
		echo "❌ API_URL environment variable required"; \
		echo "Usage: make test-load API_URL=https://api.yourcompany.com"; \
		exit 1; \
	fi
	API_URL="$(API_URL)" k6 run tests/load/k6_inference_test.js
# Infrastructure deployment
# Initialises Terraform in the directory of the selected environment and
# writes the resulting plan to <ENV>.tfplan inside that directory.
plan: ## Plan infrastructure changes
	@printf '%s\n' "📋 Planning $(ENV) infrastructure..."
	cd $(TF_DIR) && terraform init && terraform plan -out=$(ENV).tfplan
# Applies the saved <ENV>.tfplan from the selected environment's Terraform
# directory. The plan file is the one written by the `plan` target.
deploy-infra: ## Deploy infrastructure only
	@printf '%s\n' "🚀 Deploying $(ENV) infrastructure..."
	cd $(TF_DIR) && terraform apply $(ENV).tfplan
	@printf '%s\n' "✅ Infrastructure deployed!"
# Runs the site playbook against the inventory file of the selected
# environment.
configure-servers: ## Configure servers with Ansible
	@printf '%s\n' "⚙️ Configuring servers..."
	cd $(ANSIBLE_DIR) && ansible-playbook -i inventory/$(ENV).yml playbooks/site.yml
	@printf '%s\n' "✅ Servers configured!"
# Plans, applies and configures the dev environment.
#
# The plan step is run as a sub-make with ENV=dev, like the other two steps
# and like deploy-prod. As a prerequisite, `plan` used whatever ENV happened
# to be set (e.g. exported in the caller's shell), so the plan could be
# produced for one environment while dev.tfplan was the file applied.
deploy-dev: ## Deploy development environment
	@$(MAKE) plan ENV=dev
	@$(MAKE) deploy-infra ENV=dev
	@$(MAKE) configure-servers ENV=dev
	@echo "🎉 Development environment ready!"
# Plans, applies and configures the staging environment.
#
# The plan step is run as a sub-make with ENV=staging. As a prerequisite,
# `plan` ran with the default ENV (dev), so a plain `make deploy-staging`
# planned dev and then tried to apply staging.tfplan, which was either missing
# or left over from an earlier run.
deploy-staging: ## Deploy staging environment
	@$(MAKE) plan ENV=staging
	@$(MAKE) deploy-infra ENV=staging
	@$(MAKE) configure-servers ENV=staging
	@echo "🎉 Staging environment ready!"
# Plans, applies and configures the production environment after an explicit
# y/Y confirmation; any other answer (or no answer) cancels without changes.
#
# The prompt is written in POSIX sh because Make runs recipes with /bin/sh by
# default, where `read -p/-n` and `[[ ... =~ ... ]]` are not available. The
# answer is now confirmed with Enter instead of being read as one keypress.
# The three steps are chained with `&&` so a failed plan can never be followed
# by an apply of a stale plan file, and a failure makes the target fail.
deploy-prod: ## Deploy production environment (requires manual approval)
	@echo "⚠️ Production deployment requires explicit confirmation"
	@echo "This will deploy to PRODUCTION environment."
	@printf 'Are you sure? [y/N] '; \
	read -r answer || answer=; \
	case "$$answer" in \
		[Yy]) \
			$(MAKE) plan ENV=production && \
			$(MAKE) deploy-infra ENV=production && \
			$(MAKE) configure-servers ENV=production && \
			echo "🎉 Production environment ready!" ;; \
		*) \
			echo "❌ Production deployment cancelled" ;; \
	esac
# Scaling operations
# Asks the autoscaler script to perform a scale-up action with a count of 1.
scale-up: ## Add one GPU server
	@printf '%s\n' "📈 Scaling up GPU servers..."
	python scripts/autoscaler.py --action=scale-up --count=1
	@printf '%s\n' "✅ Scale up initiated!"
# Asks the autoscaler script to perform a scale-down action with a count of 1.
scale-down: ## Remove one GPU server
	@printf '%s\n' "📉 Scaling down GPU servers..."
	python scripts/autoscaler.py --action=scale-down --count=1
	@printf '%s\n' "✅ Scale down initiated!"
# Monitoring and reporting
# Writes the cost analysis as Markdown and JSON to
# reports/cost-report-YYYYMMDD.{md,json}.
#
# reports/ is created first, because the shell redirection fails when the
# directory does not exist. The date stamp is computed once and shared by both
# files, so the pair always carries the same name even across midnight.
cost-report: ## Generate cost analysis report
	@echo "💰 Generating cost report..."
	@mkdir -p reports
	stamp=$$(date +%Y%m%d) && \
	python scripts/cost-analysis.py --format=markdown > "reports/cost-report-$$stamp.md" && \
	python scripts/cost-analysis.py --format=json > "reports/cost-report-$$stamp.json"
	@echo "✅ Cost report generated in reports/"
# Prints a banner and then the output of the decision-metrics script in
# summary mode.
metrics: ## Show current infrastructure metrics
	@printf '%s\n' \
		"📊 Current Infrastructure Metrics" \
		"=================================="
	@python scripts/decision-metrics.py --summary
# Lists every root-module resource in the selected environment's Terraform
# state whose type contains "hcloud" (type, name and status, or "unknown"),
# then pings all hosts in the environment's Ansible inventory.
status: ## Show infrastructure status
	@printf '%s\n' \
		"🔍 Infrastructure Status" \
		"======================="
	@cd $(TF_DIR) && terraform show -json | jq -r '.values.root_module.resources[] | select(.type | contains("hcloud")) | "\(.type): \(.values.name) - \(.values.status // "unknown")"'
	@printf '%s\n' \
		"" \
		"🖥️ Server Health" \
		"==============="
	@cd $(ANSIBLE_DIR) && ansible all -i inventory/$(ENV).yml -m ping --one-line
# Backup and recovery
# Saves the selected environment's Terraform state and a tarball of the
# Ansible inventory directory under backups/YYYYMMDD/.
#
# The date stamp is computed once in the shell; it used to be evaluated
# separately for the mkdir, both writes and the final message, so a run
# crossing midnight could write into a directory that was never created.
# The destination is built from $(CURDIR) instead of `../` chains, so it stays
# correct regardless of how deep TF_DIR or ANSIBLE_DIR are. The steps are
# chained with `&&` so the success message only appears when both succeed.
backup: ## Create infrastructure backup
	@echo "💾 Creating infrastructure backup..."
	@stamp=$$(date +%Y%m%d) && \
	dest="$(CURDIR)/backups/$$stamp" && \
	mkdir -p "$$dest" && \
	( cd $(TF_DIR) && terraform state pull > "$$dest/terraform-state-$(ENV).json" ) && \
	( cd $(ANSIBLE_DIR) && tar czf "$$dest/ansible-inventory-$(ENV).tar.gz" inventory/ ) && \
	echo "✅ Backup created in backups/$$stamp/"
# Pushes backups/<BACKUP_DATE>/terraform-state-<ENV>.json back as the
# Terraform state of the selected environment, after checking that BACKUP_DATE
# is set, that its backup directory exists, and that the user confirms with
# y/Y.
#
# The prompt is written in POSIX sh because Make runs recipes with /bin/sh by
# default, where `read -p/-n` and `[[ ... =~ ... ]]` are not available. The
# answer is confirmed with Enter. The success message is chained with `&&`, so
# a failed `terraform state push` is no longer reported as a restored state
# and makes the target fail.
restore: ## Restore infrastructure from backup
	@echo "⚠️ This will restore infrastructure from backup"
	@if [ -z "$(BACKUP_DATE)" ]; then \
		echo "❌ BACKUP_DATE required"; \
		echo "Usage: make restore BACKUP_DATE=20241201"; \
		exit 1; \
	fi
	@if [ ! -d "backups/$(BACKUP_DATE)" ]; then \
		echo "❌ Backup directory backups/$(BACKUP_DATE) not found"; \
		exit 1; \
	fi
	@printf 'Restore from backup %s? [y/N] ' "$(BACKUP_DATE)"; \
	read -r answer || answer=; \
	case "$$answer" in \
		[Yy]) \
			cd $(TF_DIR) && \
			terraform state push ../../../backups/$(BACKUP_DATE)/terraform-state-$(ENV).json && \
			echo "✅ State restored from backup" ;; \
		*) \
			echo "❌ Restore cancelled" ;; \
	esac
# Cleanup
# Runs `terraform destroy` for the selected environment, but only after the
# user types the exact phrase <ENV>-destroy-confirm.
#
# The prompt is written in POSIX sh because Make runs recipes with /bin/sh by
# default, where `read -p` and `[[ ... == ... ]]` are not available. The
# "destroyed" message is chained with `&&`, so it is printed only when
# terraform succeeded; a failed or aborted destroy now fails the target
# instead of being reported as done.
destroy: ## Destroy infrastructure (requires confirmation)
	@echo "💥 This will DESTROY the $(ENV) infrastructure!"
	@echo "All servers, data, and configurations will be permanently deleted."
	@printf "Type '%s' to proceed: " "$(ENV)-destroy-confirm"; \
	read -r answer || answer=; \
	if [ "$$answer" = "$(ENV)-destroy-confirm" ]; then \
		cd $(TF_DIR) && terraform destroy && \
		echo "💥 Infrastructure destroyed!"; \
	else \
		echo "❌ Destruction cancelled (incorrect confirmation)"; \
	fi
# Removes generated artefacts anywhere below the current directory: saved
# Terraform plans, .terraform working directories, compiled Python files and
# __pycache__ directories.
clean: ## Clean temporary files and caches
	@printf '%s\n' "🧹 Cleaning temporary files..."
	find . -name "*.tfplan" -delete
	find . -name ".terraform" -type d -exec rm -rf {} +
	find . -name "*.pyc" -delete
	find . -name "__pycache__" -type d -exec rm -rf {} +
	@printf '%s\n' "✅ Cleanup complete!"
# Development helpers
# Fetches the last 50 journal lines of the vllm-api unit from the gex44 host
# group of the dev inventory.
dev-logs: ## Show logs from development environment
	@printf '%s\n' \
		"📋 Development Environment Logs" \
		"=============================="
	cd $(ANSIBLE_DIR) && ansible gex44 -i inventory/dev.yml -m shell -a "journalctl -u vllm-api -n 50 --no-pager"
# Opens an SSH session as ubuntu to the first address in the `gex44_ips`
# Terraform output of the selected environment's Terraform directory.
#
# The address is validated before use: when the output is missing (jq's
# `// empty` turns a null result into no output) or terraform fails, the
# recipe stops with an error instead of attempting `ssh ubuntu@null` or
# `ssh ubuntu@`. The destination is quoted so it always reaches ssh as one
# argument.
dev-ssh: ## SSH to development GPU server
	@echo "🔌 Connecting to development GPU server..."
	@SERVER_IP=$$(cd $(TF_DIR) && terraform output -json | jq -r '.gex44_ips.value[0] // empty'); \
	if [ -z "$$SERVER_IP" ]; then \
		echo "❌ No gex44_ips output found in $(TF_DIR); is the environment deployed?" >&2; \
		exit 1; \
	fi; \
	ssh -i ~/.ssh/hetzner_key "ubuntu@$$SERVER_IP"
# Fetches the last 50 journal lines of the unit named by SERVICE from every
# host in the selected environment's inventory. Without SERVICE it prints the
# known service names plus a usage line and fails.
logs: ## Show logs from specified environment
	@test -n "$(SERVICE)" || { \
		echo "📋 Available services: vllm-api, haproxy, prometheus, grafana"; \
		echo "Usage: make logs SERVICE=vllm-api ENV=production"; \
		exit 1; \
	}
	cd $(ANSIBLE_DIR) && ansible all -i inventory/$(ENV).yml -m shell -a "journalctl -u $(SERVICE) -n 50 --no-pager"
# Documentation
# Builds the documentation with mkdocs, installing mkdocs through pip first
# when the command is not found on PATH.
docs: ## Generate documentation
	@printf '%s\n' "📚 Generating documentation..."
	@if ! command -v mkdocs >/dev/null 2>&1; then \
		pip install mkdocs; \
	fi
	mkdocs build
	@printf '%s\n' "✅ Documentation generated in site/"
# Starts the mkdocs development server after announcing the local URL.
docs-serve: ## Serve documentation locally
	@printf '%s\n' "📖 Serving documentation at http://localhost:8000"
	mkdocs serve
# CI/CD helpers
# CI entry point: validates all configurations and runs the full test suite.
#
# Only `test` is invoked, because `test` already lists `validate` as a
# prerequisite. Calling `validate` separately beforehand ran every Terraform
# init/validate and Ansible lint step twice per pipeline run.
ci-validate: ## Validation for CI pipeline
	@$(MAKE) test
# CI entry point for staging: runs the deploy-staging target in a sub-make.
ci-deploy-staging: ## Deploy staging (for CI)
	@$(MAKE) deploy-staging
# CI entry point for production: runs the deploy-prod target in a sub-make.
# NOTE(review): deploy-prod asks for a y/N confirmation on stdin and treats
# anything else as a cancellation. In a non-interactive CI job there is
# presumably nothing to answer the prompt, so this may cancel without
# deploying - confirm how the pipeline is expected to supply the answer.
ci-deploy-production: ## Deploy production (for CI)
	@$(MAKE) deploy-prod
# Quick operations
# One-screen overview: selected environment, number of root-module resources
# in the Terraform state, result of a GET on $(API_URL)/health, and the most
# recently modified entry under backups/.
#
# The backup line captures the listing first and substitutes "No backups"
# when it is empty. The earlier `ls ... | head -1 || echo ...` form could
# never reach its fallback, because the pipeline's status is that of `head`,
# which succeeds even when backups/ is missing or empty.
quick-status: ## Quick infrastructure overview
	@echo "⚡ Quick Status Overview"
	@echo "======================"
	@echo "Environment: $(ENV)"
	@echo "Terraform state: $$(cd $(TF_DIR) && terraform show -json 2>/dev/null | jq -r '.values.root_module.resources | length // "No resources"') resources"
	@python -c "import requests; print('API Health:', 'OK' if requests.get('$(API_URL)/health', timeout=5).status_code == 200 else 'FAIL')" 2>/dev/null || echo "API Health: Unknown (set API_URL)"
	@latest=$$(ls -1t backups 2>/dev/null | head -1); \
	echo "Last backup: $${latest:-No backups}"
# Prompts for a server count between 1 and 5 and passes it to the autoscaler
# script's emergency-scale action; any other input is rejected.
#
# The prompt is written in POSIX sh because Make runs recipes with /bin/sh by
# default, where `read -p/-n` and `[[ ... =~ ... ]]` are not available. The
# count is confirmed with Enter. The "initiated" message is chained with `&&`,
# so it is printed only when the autoscaler call succeeded, and a failing call
# fails the target.
emergency-scale: ## Emergency scale up (bypasses normal limits)
	@echo "🚨 EMERGENCY SCALE UP"
	@echo "This will immediately order new GPU servers"
	@printf 'Number of servers to add [1-5]: '; \
	read -r count || count=; \
	case "$$count" in \
		[1-5]) \
			python scripts/autoscaler.py --action=emergency-scale --count="$$count" && \
			echo "🚨 Emergency scale initiated for $$count servers" ;; \
		*) \
			echo "❌ Invalid server count" ;; \
	esac
# Environment info
# Prints the selected environment and the directories derived from it, then
# reports for each credential variable whether it is set in the recipe's
# environment. Only API_URL has its value echoed; the others show set/missing.
env-info: ## Show environment configuration
	@printf '%s\n' \
		"🔍 Environment Information" \
		"=========================" \
		"Current Environment: $(ENV)" \
		"Terraform Directory: $(TF_DIR)" \
		"Ansible Directory: $(ANSIBLE_DIR)" \
		"" \
		"Required Environment Variables:" \
		"------------------------------"
	@echo "HCLOUD_TOKEN: $$([ -n "$$HCLOUD_TOKEN" ] && echo "✅ Set" || echo "❌ Missing")"
	@echo "ROBOT_API_USER: $$([ -n "$$ROBOT_API_USER" ] && echo "✅ Set" || echo "❌ Missing")"
	@echo "ROBOT_API_PASSWORD: $$([ -n "$$ROBOT_API_PASSWORD" ] && echo "✅ Set" || echo "❌ Missing")"
	@echo "API_URL: $$([ -n "$$API_URL" ] && echo "✅ Set ($$API_URL)" || echo "❌ Missing")"