# AI infrastructure management Makefile.
#
# Thin task runner around Terraform (terraform/environments/<env>), Ansible
# (ansible/), Go/molecule/k6 tests and a few Python helper scripts.
# Run `make help` for the list of documented targets.
#
# Tunables (override on the command line, e.g. `make plan ENV=staging`):
#   ENV          target environment; selects the Terraform dir, the plan file
#                name and the Ansible inventory (default: dev)
#   API_URL      base URL of the deployed API (test-load, quick-status)
#   SERVICE      systemd unit whose journal `logs` prints
#   BACKUP_DATE  YYYYMMDD backup directory name consumed by `restore`
#   PYTHON       Python interpreter used for helper scripts (default: python)

# Several recipes rely on bash-only features (`read -p/-n`, `[[ ... =~ ... ]]`),
# so do not depend on whatever /bin/sh happens to be.
SHELL := bash

.DEFAULT_GOAL := help

# Every target here is a command, not a file. Declaring all of them phony keeps
# same-named files/directories (docs/, backups/, ...) from masking a target.
.PHONY: help setup validate test test-load \
	plan deploy-infra configure-servers deploy-dev deploy-staging deploy-prod \
	scale-up scale-down emergency-scale \
	cost-report metrics status quick-status env-info \
	backup restore destroy clean \
	dev-logs dev-ssh logs docs docs-serve \
	ci-validate ci-deploy-staging ci-deploy-production

# Environment detection
ENV ?= dev
TF_DIR = terraform/environments/$(ENV)
ANSIBLE_DIR = ansible

# Terraform directory of the development environment, independent of ENV;
# used by the dev-* helper targets.
DEV_TF_DIR := terraform/environments/dev

# Interpreter for the helper scripts under scripts/ and tests/.
PYTHON ?= python

# Date stamp computed once per invocation so that every file written by a
# single `backup` or `cost-report` run carries the same date.
TODAY := $(shell date +%Y%m%d)

# Default target
help: ## Show this help message
	@echo "AI Infrastructure Management Commands"
	@echo "===================================="
	@awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)

# Setup and dependencies
setup: ## Install all dependencies and tools
	@echo "🔧 Installing dependencies..."
	@command -v terraform >/dev/null 2>&1 || (echo "❌ Terraform not found. Install from https://terraform.io" && exit 1)
	@command -v ansible >/dev/null 2>&1 || (echo "❌ Ansible not found. Install with: pip install ansible" && exit 1)
	@command -v go >/dev/null 2>&1 || (echo "❌ Go not found (needed for tests). Install from https://golang.org" && exit 1)
	@command -v k6 >/dev/null 2>&1 || (echo "❌ K6 not found. Install from https://k6.io" && exit 1)
	@echo "✅ Installing Ansible dependencies..."
	cd $(ANSIBLE_DIR) && ansible-galaxy install -r requirements.yml
	@echo "✅ Installing Go test dependencies..."
	cd tests/terraform && go mod download
	@echo "✅ Setup complete!"

# Validation and linting
validate: ## Validate all configurations
	@echo "🔍 Validating Terraform configurations..."
	@# Each environment runs in a subshell so a failed `cd` cannot leak into the
	@# next iteration; the first failure aborts the whole target.
	@for env in dev staging production; do \
		echo "Validating $$env environment..."; \
		(cd terraform/environments/$$env && terraform init -backend=false && terraform validate) || exit 1; \
	done
	@echo "🔍 Validating Ansible playbooks..."
	cd $(ANSIBLE_DIR) && ansible-playbook --syntax-check playbooks/site.yml
	cd $(ANSIBLE_DIR) && ansible-lint playbooks/
	@echo "✅ All configurations valid!"

# Testing
test: validate ## Run all tests
	@echo "🧪 Running infrastructure tests..."
	cd tests/terraform && go test -v ./...
	@echo "🧪 Running Ansible tests..."
	cd $(ANSIBLE_DIR)/roles/vllm && molecule test
	@echo "🧪 Running contract tests..."
	$(PYTHON) tests/contracts/test_inference_api.py
	@echo "✅ All tests passed!"

test-load: ## Run load tests against deployed infrastructure
	@echo "📊 Running load tests..."
	@if [ -z "$(API_URL)" ]; then \
		echo "❌ API_URL environment variable required"; \
		echo "Usage: make test-load API_URL=https://api.yourcompany.com"; \
		exit 1; \
	fi
	API_URL="$(API_URL)" k6 run tests/load/k6_inference_test.js

# Infrastructure deployment
plan: ## Plan infrastructure changes
	@echo "📋 Planning $(ENV) infrastructure..."
	cd $(TF_DIR) && terraform init && terraform plan -out=$(ENV).tfplan

deploy-infra: ## Deploy infrastructure only
	@echo "🚀 Deploying $(ENV) infrastructure..."
	cd $(TF_DIR) && terraform apply $(ENV).tfplan
	@echo "✅ Infrastructure deployed!"

configure-servers: ## Configure servers with Ansible
	@echo "⚙️ Configuring servers..."
	cd $(ANSIBLE_DIR) && ansible-playbook -i inventory/$(ENV).yml playbooks/site.yml
	@echo "✅ Servers configured!"

# `plan` is invoked from the recipe with an explicit ENV rather than listed as
# a prerequisite: a prerequisite would be planned with the caller's ENV, which
# may not be the environment whose <env>.tfplan is applied right after.
deploy-dev: ## Deploy development environment
	@$(MAKE) plan ENV=dev
	@$(MAKE) deploy-infra ENV=dev
	@$(MAKE) configure-servers ENV=dev
	@echo "🎉 Development environment ready!"

deploy-staging: ## Deploy staging environment
	@$(MAKE) plan ENV=staging
	@$(MAKE) deploy-infra ENV=staging
	@$(MAKE) configure-servers ENV=staging
	@echo "🎉 Staging environment ready!"

deploy-prod: ## Deploy production environment (requires manual approval)
	@echo "⚠️ Production deployment requires explicit confirmation"
	@echo "This will deploy to PRODUCTION environment."
	@# Steps are chained with && so a failed plan never leads to an apply and the
	@# success message is printed only when every step succeeded.
	@read -p "Are you sure? [y/N] " -n 1 -r; \
	echo; \
	if [[ $$REPLY =~ ^[Yy]$$ ]]; then \
		$(MAKE) plan ENV=production && \
		$(MAKE) deploy-infra ENV=production && \
		$(MAKE) configure-servers ENV=production && \
		echo "🎉 Production environment ready!"; \
	else \
		echo "❌ Production deployment cancelled"; \
	fi

# Scaling operations
scale-up: ## Add one GPU server
	@echo "📈 Scaling up GPU servers..."
	$(PYTHON) scripts/autoscaler.py --action=scale-up --count=1
	@echo "✅ Scale up initiated!"

scale-down: ## Remove one GPU server
	@echo "📉 Scaling down GPU servers..."
	$(PYTHON) scripts/autoscaler.py --action=scale-down --count=1
	@echo "✅ Scale down initiated!"

# Monitoring and reporting
cost-report: ## Generate cost analysis report
	@echo "💰 Generating cost report..."
	@mkdir -p reports
	$(PYTHON) scripts/cost-analysis.py --format=markdown > reports/cost-report-$(TODAY).md
	$(PYTHON) scripts/cost-analysis.py --format=json > reports/cost-report-$(TODAY).json
	@echo "✅ Cost report generated in reports/"

metrics: ## Show current infrastructure metrics
	@echo "📊 Current Infrastructure Metrics"
	@echo "=================================="
	@$(PYTHON) scripts/decision-metrics.py --summary

status: ## Show infrastructure status
	@echo "🔍 Infrastructure Status"
	@echo "======================="
	@cd $(TF_DIR) && terraform show -json | jq -r '.values.root_module.resources[] | select(.type | contains("hcloud")) | "\(.type): \(.values.name) - \(.values.status // "unknown")"'
	@echo ""
	@echo "🖥️ Server Health"
	@echo "==============="
	@cd $(ANSIBLE_DIR) && ansible all -i inventory/$(ENV).yml -m ping --one-line

# Backup and recovery
# Backup paths are anchored at $(CURDIR) (the directory make was started in)
# so they stay correct regardless of how deep TF_DIR / ANSIBLE_DIR are.
backup: ## Create infrastructure backup
	@echo "💾 Creating infrastructure backup..."
	mkdir -p backups/$(TODAY)
	cd $(TF_DIR) && terraform state pull > "$(CURDIR)/backups/$(TODAY)/terraform-state-$(ENV).json"
	cd $(ANSIBLE_DIR) && tar czf "$(CURDIR)/backups/$(TODAY)/ansible-inventory-$(ENV).tar.gz" inventory/
	@echo "✅ Backup created in backups/$(TODAY)/"

restore: ## Restore infrastructure from backup
	@echo "⚠️ This will restore infrastructure from backup"
	@if [ -z "$(BACKUP_DATE)" ]; then \
		echo "❌ BACKUP_DATE required"; \
		echo "Usage: make restore BACKUP_DATE=20241201"; \
		exit 1; \
	fi
	@if [ ! -d "backups/$(BACKUP_DATE)" ]; then \
		echo "❌ Backup directory backups/$(BACKUP_DATE) not found"; \
		exit 1; \
	fi
	@# Only the Terraform state is pushed back; success is reported only if the
	@# push itself succeeded.
	@read -p "Restore from backup $(BACKUP_DATE)? [y/N] " -n 1 -r; \
	echo; \
	if [[ $$REPLY =~ ^[Yy]$$ ]]; then \
		cd $(TF_DIR) && \
		terraform state push "$(CURDIR)/backups/$(BACKUP_DATE)/terraform-state-$(ENV).json" && \
		echo "✅ State restored from backup"; \
	fi

# Cleanup
destroy: ## Destroy infrastructure (requires confirmation)
	@echo "💥 This will DESTROY the $(ENV) infrastructure!"
	@echo "All servers, data, and configurations will be permanently deleted."
	@# The operator must type the environment-specific phrase verbatim.
	@read -p "Type '$(ENV)-destroy-confirm' to proceed: " -r; \
	if [[ "$$REPLY" == "$(ENV)-destroy-confirm" ]]; then \
		cd $(TF_DIR) && terraform destroy && echo "💥 Infrastructure destroyed!"; \
	else \
		echo "❌ Destruction cancelled (incorrect confirmation)"; \
	fi

clean: ## Clean temporary files and caches
	@echo "🧹 Cleaning temporary files..."
	find . -name "*.tfplan" -delete
	@# -prune: do not descend into directories that are about to be removed.
	find . -name ".terraform" -type d -prune -exec rm -rf {} +
	find . -name "*.pyc" -delete
	find . -name "__pycache__" -type d -prune -exec rm -rf {} +
	@echo "✅ Cleanup complete!"

# Development helpers
dev-logs: ## Show logs from development environment
	@echo "📋 Development Environment Logs"
	@echo "=============================="
	cd $(ANSIBLE_DIR) && ansible gex44 -i inventory/dev.yml -m shell -a "journalctl -u vllm-api -n 50 --no-pager"

dev-ssh: ## SSH to development GPU server
	@echo "🔌 Connecting to development GPU server..."
	@# Always reads the dev Terraform outputs (like dev-logs always uses the dev
	@# inventory), so a stray ENV=... cannot redirect this to another environment.
	@SERVER_IP=$$(cd $(DEV_TF_DIR) && terraform output -json | jq -r '.gex44_ips.value[0] // empty'); \
	if [ -z "$$SERVER_IP" ]; then \
		echo "❌ No gex44 server IP found in Terraform outputs"; \
		exit 1; \
	fi; \
	ssh -i ~/.ssh/hetzner_key ubuntu@$$SERVER_IP

logs: ## Show logs from specified environment
	@if [ -z "$(SERVICE)" ]; then \
		echo "📋 Available services: vllm-api, haproxy, prometheus, grafana"; \
		echo "Usage: make logs SERVICE=vllm-api ENV=production"; \
		exit 1; \
	fi
	cd $(ANSIBLE_DIR) && ansible all -i inventory/$(ENV).yml -m shell -a "journalctl -u $(SERVICE) -n 50 --no-pager"

# Documentation
docs: ## Generate documentation
	@echo "📚 Generating documentation..."
	@command -v mkdocs >/dev/null 2>&1 || pip install mkdocs
	mkdocs build
	@echo "✅ Documentation generated in site/"

docs-serve: ## Serve documentation locally
	@echo "📖 Serving documentation at http://localhost:8000"
	mkdocs serve

# CI/CD helpers
# `test` already has `validate` as a prerequisite, so invoking `test` alone
# runs validation exactly once.
ci-validate: ## Validation for CI pipeline
	@$(MAKE) test

ci-deploy-staging: ## Deploy staging (for CI)
	@$(MAKE) deploy-staging

# NOTE(review): deploy-prod asks for confirmation on stdin. In a CI job without
# a TTY the prompt reads nothing, the deployment is reported as cancelled and
# the target still exits 0 - confirm this is the intended CI behaviour.
ci-deploy-production: ## Deploy production (for CI)
	@$(MAKE) deploy-prod

# Quick operations
quick-status: ## Quick infrastructure overview
	@echo "⚡ Quick Status Overview"
	@echo "======================"
	@echo "Environment: $(ENV)"
	@echo "Terraform state: $$(cd $(TF_DIR) && terraform show -json 2>/dev/null | jq -r '.values.root_module.resources | length // "No resources"') resources"
	@$(PYTHON) -c "import requests; print('API Health:', 'OK' if requests.get('$(API_URL)/health', timeout=5).status_code == 200 else 'FAIL')" 2>/dev/null || echo "API Health: Unknown (set API_URL)"
	@# A pipeline's status is that of its last command, so `ls | head || echo`
	@# can never reach the fallback; default the captured value instead.
	@latest=$$(ls -1t backups 2>/dev/null | head -1); \
	echo "Last backup: $${latest:-No backups}"

emergency-scale: ## Emergency scale up (bypasses normal limits)
	@echo "🚨 EMERGENCY SCALE UP"
	@echo "This will immediately order new GPU servers"
	@# Accepts a single digit 1-5; the confirmation message is printed only if
	@# the autoscaler call succeeded.
	@read -p "Number of servers to add [1-5]: " -n 1 -r; \
	echo; \
	if [[ $$REPLY =~ ^[1-5]$$ ]]; then \
		$(PYTHON) scripts/autoscaler.py --action=emergency-scale --count=$$REPLY && \
		echo "🚨 Emergency scale initiated for $$REPLY servers"; \
	else \
		echo "❌ Invalid server count"; \
	fi

# Environment info
# Reports only whether each variable is set in the environment; the values of
# the credentials are not printed (API_URL is shown because it is not a secret).
env-info: ## Show environment configuration
	@echo "🔍 Environment Information"
	@echo "========================="
	@echo "Current Environment: $(ENV)"
	@echo "Terraform Directory: $(TF_DIR)"
	@echo "Ansible Directory: $(ANSIBLE_DIR)"
	@echo ""
	@echo "Required Environment Variables:"
	@echo "------------------------------"
	@echo "HCLOUD_TOKEN: $$([ -n "$$HCLOUD_TOKEN" ] && echo "✅ Set" || echo "❌ Missing")"
	@echo "ROBOT_API_USER: $$([ -n "$$ROBOT_API_USER" ] && echo "✅ Set" || echo "❌ Missing")"
	@echo "ROBOT_API_PASSWORD: $$([ -n "$$ROBOT_API_PASSWORD" ] && echo "✅ Set" || echo "❌ Missing")"
	@echo "API_URL: $$([ -n "$$API_URL" ] && echo "✅ Set ($$API_URL)" || echo "❌ Missing")"