---
# GitLab CI/CD Pipeline for AI Infrastructure
# Production-ready pipeline with comprehensive testing and deployment.
#
# Flow: validate -> test -> security -> deploy-staging -> integration-test
#       -> deploy-production (manual) -> post-deploy (manual checks/reports).
# Templates are hidden jobs (leading dot) composed into jobs via `extends:`.

stages:
  - validate
  - test
  - security
  - deploy-staging
  - integration-test
  - deploy-production
  - post-deploy

variables:
  TF_ROOT: terraform
  ANSIBLE_ROOT: ansible
  TF_VERSION: "1.6.0"
  ANSIBLE_VERSION: "8.5.0"
  PYTHON_VERSION: "3.11"
  GO_VERSION: "1.21"

  # Terraform state configuration
  TF_STATE_NAME: ai-infrastructure
  TF_CACHE_KEY: "$CI_COMMIT_REF_SLUG"

  # Security scanning
  SECURITY_SCAN_ENABLED: "true"

  # Performance testing
  LOAD_TEST_ENABLED: "true"

  # Deployment settings
  DEPLOY_TIMEOUT: "1800"  # 30 minutes

# ================================
# TEMPLATES FOR REUSABILITY
# ================================
# The YAML anchors are retained only for backward compatibility; jobs in this
# file compose the templates with `extends:` because repeating `<<:` inside
# one mapping is a duplicate key.

# Terraform image plus GitLab-managed HTTP state backend bootstrap.
.terraform_base: &terraform_base
  image:
    name: hashicorp/terraform:$TF_VERSION
    # The image's entrypoint is the terraform binary; reset it so the runner
    # can start a shell for before_script/script.
    entrypoint: [""]
  before_script:
    - cd $TF_ROOT
    - terraform --version
    # Generate the backend definition pointing at this project's
    # GitLab-managed Terraform state, authenticated with the job token.
    - |
      cat << EOF > backend.tf
      terraform {
        backend "http" {
          address        = "$CI_API_V4_URL/projects/$CI_PROJECT_ID/terraform/state/$TF_STATE_NAME"
          lock_address   = "$CI_API_V4_URL/projects/$CI_PROJECT_ID/terraform/state/$TF_STATE_NAME/lock"
          unlock_address = "$CI_API_V4_URL/projects/$CI_PROJECT_ID/terraform/state/$TF_STATE_NAME/lock"
          username       = "gitlab-ci-token"
          password       = "$CI_JOB_TOKEN"
          lock_method    = "POST"
          unlock_method  = "DELETE"
          retry_wait_min = 5
        }
      }
      EOF
    # NOTE(review): init runs in $TF_ROOT, while several jobs then cd into
    # environments/<env> before plan/validate/destroy. Confirm those
    # directories do not need their own backend.tf + `terraform init`, and
    # that sharing one TF_STATE_NAME across environments is intended.
    - terraform init

# Ansible runner image with Galaxy requirements and the vault password file.
.ansible_base: &ansible_base
  image: quay.io/ansible/ansible-runner:latest
  before_script:
    - cd $ANSIBLE_ROOT
    - ansible --version
    - ansible-galaxy install -r requirements.yml
    # Create the vault password file under a restrictive umask so it is never
    # readable by others, not even briefly before the chmod.
    - (umask 077 && echo "$ANSIBLE_VAULT_PASSWORD" > /tmp/.vault-pass)
    - chmod 600 /tmp/.vault-pass

# Docker-in-Docker over TLS.
.docker_base: &docker_base
  image: docker:latest
  services:
    - docker:dind
  variables:
    DOCKER_HOST: tcp://docker:2376
    DOCKER_TLS_CERTDIR: "/certs"

# Cache configurations (keyed per branch/ref).
.terraform_cache: &terraform_cache
  cache:
    key: terraform-$CI_COMMIT_REF_SLUG
    paths:
      - $TF_ROOT/.terraform/
      - $TF_ROOT/.terraform.lock.hcl

.ansible_cache: &ansible_cache
  cache:
    key: ansible-$CI_COMMIT_REF_SLUG
    paths:
      - $ANSIBLE_ROOT/collections/
      - $ANSIBLE_ROOT/roles/

# ================================
# VALIDATION STAGE
# ================================

# Fails when any Terraform file is not canonically formatted.
terraform_format_check:
  extends:
    - .terraform_base
    - .terraform_cache
  stage: validate
  script:
    - terraform fmt -check=true -recursive
  rules:
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
    - if: $CI_COMMIT_BRANCH == "main"

# Validates each environment's Terraform configuration in turn.
terraform_validate:
  extends:
    - .terraform_base
    - .terraform_cache
  stage: validate
  script:
    - cd environments/dev
    - terraform validate
    - cd ../staging
    - terraform validate
    - cd ../production
    - terraform validate
  rules:
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
    - if: $CI_COMMIT_BRANCH == "main"

# Parses the playbooks without executing them.
ansible_syntax_check:
  extends:
    - .ansible_base
    - .ansible_cache
  stage: validate
  script:
    - ansible-playbook --syntax-check playbooks/site.yml
    - ansible-playbook --syntax-check playbooks/gex44-setup.yml
  rules:
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
    - if: $CI_COMMIT_BRANCH == "main"

# Lints the playbooks. Findings surface as a job warning instead of being
# swallowed; the pipeline is not blocked.
ansible_lint:
  extends:
    - .ansible_base
    - .ansible_cache
  stage: validate
  script:
    - ansible-lint playbooks/
  allow_failure: true  # Non-blocking for now
  rules:
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
    - if: $CI_COMMIT_BRANCH == "main"

# Lints this pipeline file and the YAML under ansible/ and monitoring/.
yaml_lint:
  image: python:$PYTHON_VERSION-slim
  stage: validate
  before_script:
    - pip install yamllint
  script:
    - yamllint .gitlab-ci.yml
    - yamllint ansible/
    - yamllint monitoring/
  rules:
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
    - if: $CI_COMMIT_BRANCH == "main"

# ================================
# TEST STAGE
# ================================

# Go-based Terraform tests.
terraform_test:
  image: golang:$GO_VERSION
  stage: test
  before_script:
    - cd tests/terraform
    - go mod download
  script:
    - go test -v -timeout 30m ./...
  artifacts:
    reports:
      # NOTE(review): `go test` alone does not write this file; confirm the
      # test suite (or a converter) produces test-results.xml.
      junit: tests/terraform/test-results.xml
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"

# Molecule scenarios for the vllm and cuda roles, run against Docker-in-Docker.
ansible_molecule_test:
  extends:
    - .docker_base
    - .ansible_cache
  stage: test
  before_script:
    - apk add --no-cache python3 py3-pip
    - pip3 install ansible molecule[docker] docker
    - cd $ANSIBLE_ROOT
  script:
    - cd roles/vllm && molecule test
    - cd ../cuda && molecule test
  artifacts:
    reports:
      junit: ansible/molecule/test-results.xml
  rules:
    - if: $CI_COMMIT_BRANCH == "main"

# Python unit tests with a JUnit report for the GitLab test widget.
python_unit_tests:
  image: python:$PYTHON_VERSION
  stage: test
  before_script:
    - pip install -r tests/requirements.txt
  script:
    - python -m pytest tests/unit/ -v --junitxml=test-results.xml
  artifacts:
    reports:
      junit: test-results.xml
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"

# ================================
# SECURITY STAGE
# ================================

# Static analysis of the Terraform code with Checkov.
terraform_security_scan:
  image:
    name: bridgecrew/checkov:latest
    # Reset the image entrypoint so the runner can start a shell.
    entrypoint: [""]
  stage: security
  script:
    # --output-file-path names a folder unless paired per output format, so
    # route the CLI output to the console and JUnit XML to the report file.
    - >-
      checkov -d terraform/ --framework terraform
      -o cli -o junitxml
      --output-file-path console,checkov-results.xml
  artifacts:
    reports:
      junit: checkov-results.xml
  allow_failure: true
  rules:
    - if: $SECURITY_SCAN_ENABLED == "true"

# Lints the playbooks and emits a SARIF report.
ansible_security_scan:
  image: quay.io/ansible/ansible-lint:latest
  stage: security
  script:
    - ansible-lint ansible/playbooks/ --format sarif --output ansible-security.sarif
  artifacts:
    reports:
      # NOTE(review): GitLab's SAST report slot expects its own JSON schema;
      # confirm a SARIF file is accepted here, and that this ansible-lint
      # version supports `--output`.
      sast: ansible-security.sarif
  allow_failure: true
  rules:
    - if: $SECURITY_SCAN_ENABLED == "true"

# Scans the working tree for committed secrets.
secret_detection:
  image: gitguardian/ggshield:latest
  stage: security
  script:
    # Directories need --recursive; --yes skips the interactive confirmation,
    # which cannot be answered in CI.
    - ggshield secret scan path --recursive --yes .
  allow_failure: true
  rules:
    - if: $SECURITY_SCAN_ENABLED == "true"

# ================================
# STAGING DEPLOYMENT
# ================================

# Plans and applies the staging infrastructure on every main-branch pipeline.
deploy_staging_infrastructure:
  extends:
    - .terraform_base
    - .terraform_cache
  stage: deploy-staging
  environment:
    name: staging
    url: https://api-staging.${CI_PROJECT_NAME}.com
    deployment_tier: staging
  script:
    - cd environments/staging
    - terraform plan -out=staging.tfplan
    - terraform apply -auto-approve staging.tfplan
  artifacts:
    name: staging-infrastructure
    paths:
      - $TF_ROOT/environments/staging/staging.tfplan
    expire_in: 1 week
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
  timeout: 30m

# Configures the staging servers once the infrastructure job has succeeded.
configure_staging_servers:
  extends:
    - .ansible_base
    - .ansible_cache
  stage: deploy-staging
  environment:
    name: staging
  needs: ["deploy_staging_infrastructure"]
  script:
    - ansible-playbook -i inventory/staging.yml playbooks/site.yml --vault-password-file /tmp/.vault-pass
  artifacts:
    name: staging-configuration
    paths:
      - $ANSIBLE_ROOT/logs/
    expire_in: 1 week
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
  timeout: 45m

# ================================
# INTEGRATION TESTS
# ================================

# Contract tests for the inference API against staging.
api_contract_tests:
  image: python:$PYTHON_VERSION
  stage: integration-test
  needs: ["configure_staging_servers"]
  before_script:
    - pip install -r tests/contracts/requirements.txt
  script:
    - python tests/contracts/test_inference_api.py --api-url="$STAGING_API_URL"
  artifacts:
    reports:
      junit: tests/contracts/test-results.xml
  rules:
    - if: $CI_COMMIT_BRANCH == "main"

# k6 load test against staging; can be switched off via LOAD_TEST_ENABLED.
load_test:
  image:
    name: grafana/k6:latest
    # The image's entrypoint is the k6 binary; reset it for the runner shell.
    entrypoint: [""]
  stage: integration-test
  needs: ["configure_staging_servers"]
  script:
    # --summary-export writes the JSON summary that the report below uploads.
    - >-
      k6 run tests/load/k6_inference_test.js
      --env API_URL="$STAGING_API_URL"
      --summary-export=tests/load/k6-report.json
  artifacts:
    reports:
      load_performance: tests/load/k6-report.json
  rules:
    - if: $LOAD_TEST_ENABLED == "true" && $CI_COMMIT_BRANCH == "main"

# End-to-end test against staging.
end_to_end_test:
  image: python:$PYTHON_VERSION
  stage: integration-test
  needs: ["configure_staging_servers"]
  before_script:
    - pip install requests pytest
  script:
    - python tests/integration/e2e_test.py --staging-url="$STAGING_API_URL"
  artifacts:
    reports:
      junit: tests/integration/e2e-results.xml
  rules:
    - if: $CI_COMMIT_BRANCH == "main"

# ================================
# PRODUCTION DEPLOYMENT
# ================================

# Manual, blocking gate: plans and applies the production infrastructure.
deploy_production_infrastructure:
  extends:
    - .terraform_base
    - .terraform_cache
  stage: deploy-production
  environment:
    name: production
    url: https://api.${CI_PROJECT_NAME}.com
    deployment_tier: production
  script:
    - cd environments/production
    - terraform plan -out=production.tfplan
    - terraform apply -auto-approve production.tfplan
  artifacts:
    name: production-infrastructure
    paths:
      - $TF_ROOT/environments/production/production.tfplan
    expire_in: 1 month
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
      when: manual
      allow_failure: false
  timeout: 30m

# Manual: configures the production servers after the infrastructure job.
configure_production_servers:
  extends:
    - .ansible_base
    - .ansible_cache
  stage: deploy-production
  environment:
    name: production
  needs: ["deploy_production_infrastructure"]
  script:
    - ansible-playbook -i inventory/production.yml playbooks/site.yml --vault-password-file /tmp/.vault-pass
  artifacts:
    name: production-configuration
    paths:
      - $ANSIBLE_ROOT/logs/
    expire_in: 1 month
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
      when: manual
  timeout: 45m

# ================================
# POST-DEPLOYMENT
# ================================

# Manual smoke tests: each curl must return a success status (-f) or the
# job aborts immediately.
production_smoke_tests:
  image: curlimages/curl:latest
  stage: post-deploy
  needs: ["configure_production_servers"]
  script:
    - |
      echo "Running smoke tests against production..."

      # Health check
      curl -f "$PRODUCTION_API_URL/health" || exit 1
      echo "✓ Health check passed"

      # Models endpoint
      curl -f "$PRODUCTION_API_URL/v1/models" || exit 1
      echo "✓ Models endpoint accessible"

      # Metrics endpoint (internal)
      curl -f "$PRODUCTION_API_URL/metrics" || exit 1
      echo "✓ Metrics endpoint accessible"

      # Monitoring dashboard
      curl -f "$PRODUCTION_MONITORING_URL" || exit 1
      echo "✓ Monitoring dashboard accessible"

      echo "All smoke tests passed!"
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
      when: manual

# Manual k6 baseline run against production.
performance_baseline:
  image:
    name: grafana/k6:latest
    # The image's entrypoint is the k6 binary; reset it for the runner shell.
    entrypoint: [""]
  stage: post-deploy
  needs: ["configure_production_servers"]
  script:
    # --summary-export writes the JSON summary that the report below uploads.
    - >-
      k6 run tests/load/baseline_test.js
      --env API_URL="$PRODUCTION_API_URL"
      --summary-export=tests/load/baseline-report.json
  artifacts:
    reports:
      load_performance: tests/load/baseline-report.json
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
      when: manual

# Manual cost report for production in JSON and Markdown form.
cost_analysis:
  image: python:$PYTHON_VERSION
  stage: post-deploy
  before_script:
    - pip install hcloud python-dateutil jinja2
  script:
    - python scripts/cost-analysis.py --environment=production --format=json > cost-report.json
    - python scripts/cost-analysis.py --environment=production --format=markdown > cost-report.md
  artifacts:
    name: cost-analysis-$CI_COMMIT_SHORT_SHA
    paths:
      - cost-report.json
      - cost-report.md
    expire_in: 1 month
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
      when: manual

# ================================
# CLEANUP AND UTILITIES
# ================================

# Manual teardown of the staging infrastructure; stops the staging environment.
destroy_staging:
  extends: .terraform_base
  stage: deploy-staging
  environment:
    name: staging
    action: stop
  script:
    - cd environments/staging
    - terraform destroy -auto-approve
  rules:
    - if: $CI_PIPELINE_SOURCE == "web"
      when: manual
    - if: $CI_COMMIT_BRANCH != "main"
      when: manual

# ================================
# SCHEDULED JOBS
# ================================

# Nightly schedule: reruns terraform_test once per ENVIRONMENT value.
nightly_full_test:
  # Must name the real job; there is no `.terraform_test` template.
  extends: terraform_test
  rules:
    - if: $CI_PIPELINE_SOURCE == "schedule" && $SCHEDULE_TYPE == "nightly"
  parallel:
    matrix:
      - ENVIRONMENT: [staging, production]

# Weekly schedule: reruns the Terraform security scan.
weekly_security_scan:
  extends: terraform_security_scan
  rules:
    - if: $CI_PIPELINE_SOURCE == "schedule" && $SCHEDULE_TYPE == "weekly"

# ================================
# DEPLOYMENT NOTIFICATIONS
# ================================

# Posts to Slack after the smoke tests succeed; a no-op when
# SLACK_WEBHOOK_URL is unset.
notify_deployment_success:
  image: curlimages/curl:latest
  stage: post-deploy
  needs: ["production_smoke_tests"]
  script:
    - |
      if [ -n "$SLACK_WEBHOOK_URL" ]; then
        curl -X POST -H 'Content-type: application/json' \
          --data "{\"text\":\"🚀 Production deployment successful for commit $CI_COMMIT_SHORT_SHA\"}" \
          "$SLACK_WEBHOOK_URL"
      fi
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
      when: on_success

# Posts to Slack when an earlier job in the pipeline has failed; a no-op when
# SLACK_WEBHOOK_URL is unset.
notify_deployment_failure:
  image: curlimages/curl:latest
  stage: post-deploy
  script:
    - |
      if [ -n "$SLACK_WEBHOOK_URL" ]; then
        curl -X POST -H 'Content-type: application/json' \
          --data "{\"text\":\"❌ Production deployment failed for commit $CI_COMMIT_SHORT_SHA. Check pipeline: $CI_PIPELINE_URL\"}" \
          "$SLACK_WEBHOOK_URL"
      fi
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
      when: on_failure