# File listing metadata: 2025-09-13 14:18:28 +02:00
#
# 504 lines
# 13 KiB
# YAML

# GitLab CI/CD Pipeline for AI Infrastructure
# Production-ready pipeline with comprehensive testing and deployment
# Pipeline stages, listed in the order they are declared to run.
stages:
  - validate
  - test
  - security
  - deploy-staging
  - integration-test
  - deploy-production
  - post-deploy
# Pipeline-wide variables; every value is a string so tool versions are never
# re-typed as numbers by the YAML parser.
variables:
  # Working directories used by the job templates
  TF_ROOT: 'terraform'
  ANSIBLE_ROOT: 'ansible'
  # Tool versions
  TF_VERSION: '1.6.0'
  ANSIBLE_VERSION: '8.5.0'
  PYTHON_VERSION: '3.11'
  GO_VERSION: '1.21'
  # Terraform state configuration
  TF_STATE_NAME: 'ai-infrastructure'
  TF_CACHE_KEY: '$CI_COMMIT_REF_SLUG'
  # Security scanning
  SECURITY_SCAN_ENABLED: 'true'
  # Performance testing
  LOAD_TEST_ENABLED: 'true'
  # Deployment settings
  DEPLOY_TIMEOUT: '1800'  # 30 minutes
# Templates for reusability
# Shared Terraform job settings, merged into jobs through the `terraform_base`
# anchor: image, plus a before_script that enters $TF_ROOT, generates the
# GitLab-managed HTTP state backend and runs `terraform init`.
.terraform_base: &terraform_base
  image:
    name: hashicorp/terraform:$TF_VERSION
    # The image's default entrypoint is the `terraform` binary itself; clear it
    # so the runner can start a shell and execute the script lines.
    entrypoint: [""]
  before_script:
    - cd $TF_ROOT
    - terraform --version
    # The heredoc delimiter is unquoted on purpose so the CI variables are
    # expanded before backend.tf is written. The HCL is laid out in canonical
    # `terraform fmt` form so that a recursive `terraform fmt -check` run in
    # this directory does not flag the generated file.
    - |
      cat << EOF > backend.tf
      terraform {
        backend "http" {
          address        = "$CI_API_V4_URL/projects/$CI_PROJECT_ID/terraform/state/$TF_STATE_NAME"
          lock_address   = "$CI_API_V4_URL/projects/$CI_PROJECT_ID/terraform/state/$TF_STATE_NAME/lock"
          unlock_address = "$CI_API_V4_URL/projects/$CI_PROJECT_ID/terraform/state/$TF_STATE_NAME/lock"
          username       = "gitlab-ci-token"
          password       = "$CI_JOB_TOKEN"
          lock_method    = "POST"
          unlock_method  = "DELETE"
          retry_wait_min = 5
        }
      }
      EOF
    - terraform init
# Shared Ansible job settings, merged into jobs through the `ansible_base`
# anchor: image, plus a before_script that enters $ANSIBLE_ROOT, installs the
# Galaxy requirements and materialises the vault password file.
.ansible_base: &ansible_base
  image: quay.io/ansible/ansible-runner:latest
  before_script:
    - cd $ANSIBLE_ROOT
    - ansible --version
    - ansible-galaxy install -r requirements.yml
    # Create the password file under a restrictive umask so it is never
    # readable by other users, not even between creation and the chmod below.
    # printf writes the secret verbatim, whereas some shells' `echo` would
    # interpret backslash sequences contained in it.
    - |
      (umask 077 && printf '%s\n' "$ANSIBLE_VAULT_PASSWORD" > /tmp/.vault-pass)
    # Still tighten the mode explicitly in case the file already existed.
    - chmod 600 /tmp/.vault-pass
# Shared Docker-in-Docker settings, merged into jobs through the `docker_base`
# anchor: the docker client image, a dind service, and the variables pointing
# the client at that service on port 2376 with certificates under /certs.
.docker_base: &docker_base
  image: 'docker:latest'
  services:
    - 'docker:dind'
  variables:
    DOCKER_HOST: 'tcp://docker:2376'
    DOCKER_TLS_CERTDIR: '/certs'
# Cache configurations
# Per-branch cache of the Terraform working data and dependency lock file
# under $TF_ROOT.
.terraform_cache: &terraform_cache
  cache:
    key: 'terraform-$CI_COMMIT_REF_SLUG'
    paths:
      - '$TF_ROOT/.terraform/'
      - '$TF_ROOT/.terraform.lock.hcl'
# Per-branch cache of the collections/ and roles/ directories under
# $ANSIBLE_ROOT.
.ansible_cache: &ansible_cache
  cache:
    key: 'ansible-$CI_COMMIT_REF_SLUG'
    paths:
      - '$ANSIBLE_ROOT/collections/'
      - '$ANSIBLE_ROOT/roles/'
# ================================
# VALIDATION STAGE
# ================================
# Fails when any Terraform file under $TF_ROOT is not in canonical format.
terraform_format_check:
  # A single merge key with a list replaces the two duplicate `<<` keys;
  # duplicate keys in one mapping are not valid YAML.
  <<: [*terraform_base, *terraform_cache]
  stage: validate
  # `terraform fmt` only reads source files, so the inherited before_script
  # (backend.tf generation and `terraform init`) is replaced here. This also
  # keeps the generated backend.tf out of the recursive check.
  before_script:
    - cd $TF_ROOT
    - terraform --version
  script:
    - terraform fmt -check=true -recursive
  rules:
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
    - if: $CI_COMMIT_BRANCH == "main"
# Validates the dev, staging and production environment configurations.
terraform_validate:
  # A single merge key with a list replaces the two duplicate `<<` keys.
  <<: [*terraform_base, *terraform_cache]
  stage: validate
  script:
    # The inherited before_script only initialises $TF_ROOT. Each environment
    # directory is its own working directory, so it is initialised here first;
    # `-backend=false` installs providers and modules without touching state.
    # `-chdir` replaces the previous chain of relative `cd` commands.
    - terraform -chdir=environments/dev init -backend=false -input=false
    - terraform -chdir=environments/dev validate
    - terraform -chdir=environments/staging init -backend=false -input=false
    - terraform -chdir=environments/staging validate
    - terraform -chdir=environments/production init -backend=false -input=false
    - terraform -chdir=environments/production validate
  rules:
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
    - if: $CI_COMMIT_BRANCH == "main"
# Syntax-checks the two playbooks without executing them.
ansible_syntax_check:
  # A single merge key with a list replaces the two duplicate `<<` keys;
  # duplicate keys in one mapping are not valid YAML.
  <<: [*ansible_base, *ansible_cache]
  stage: validate
  script:
    - ansible-playbook --syntax-check playbooks/site.yml
    - ansible-playbook --syntax-check playbooks/gex44-setup.yml
  rules:
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
    - if: $CI_COMMIT_BRANCH == "main"
# Lints the playbooks directory; findings are reported but do not block.
ansible_lint:
  # A single merge key with a list replaces the two duplicate `<<` keys.
  <<: [*ansible_base, *ansible_cache]
  stage: validate
  script:
    # No `|| true` here: it forced the job green and hid every finding.
    # `allow_failure` below already keeps the job non-blocking while still
    # surfacing a failed lint run as a pipeline warning.
    - ansible-lint playbooks/
  # Non-blocking for now
  allow_failure: true
  rules:
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
    - if: $CI_COMMIT_BRANCH == "main"
# Runs yamllint over the pipeline file and the ansible/ and monitoring/ trees.
yaml_lint:
  stage: validate
  image: 'python:$PYTHON_VERSION-slim'
  before_script:
    - pip install yamllint
  script:
    # Kept as three separate invocations: the job stops at the first target
    # that fails.
    - yamllint .gitlab-ci.yml
    - yamllint ansible/
    - yamllint monitoring/
  rules:
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
    - if: '$CI_COMMIT_BRANCH == "main"'
# ================================
# TEST STAGE
# ================================
# Runs the Go test suite under tests/terraform and publishes a JUnit report.
terraform_test:
  image: golang:$GO_VERSION
  stage: test
  before_script:
    - cd tests/terraform
    - go mod download
    # Plain `go test` never writes test-results.xml. gotestsum wraps it and
    # emits the JUnit file declared under artifacts; the version is pinned so
    # the install does not drift with the Go toolchain in the image.
    - go install gotest.tools/gotestsum@v1.11.0
  script:
    # `standard-verbose` prints the same output as `go test -v`; everything
    # after `--` is passed through to `go test`.
    - gotestsum --junitfile test-results.xml --format standard-verbose -- -timeout 30m ./...
  artifacts:
    # Upload the report for failing runs too, which is when it is needed.
    when: always
    reports:
      junit: tests/terraform/test-results.xml
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
# Runs the Molecule scenarios of the vllm and cuda roles with Docker-in-Docker.
ansible_molecule_test:
  # A single merge key with a list replaces the two duplicate `<<` keys.
  <<: [*docker_base, *ansible_cache]
  stage: test
  before_script:
    - apk add --no-cache python3 py3-pip
    # Install into a virtualenv: recent Alpine releases mark the system Python
    # as externally managed (PEP 668), so a bare `pip3 install` is rejected.
    # before_script and script share one shell, so the activation carries over.
    - python3 -m venv /opt/molecule-venv
    - . /opt/molecule-venv/bin/activate
    # The extra is quoted so the shell never treats the brackets as a glob.
    - pip3 install ansible 'molecule[docker]' docker
    - cd $ANSIBLE_ROOT
  script:
    - cd roles/vllm && molecule test
    # Relative to roles/vllm, which the previous line left as the working dir.
    - cd ../cuda && molecule test
  artifacts:
    reports:
      # NOTE(review): nothing in this job's script visibly writes this file;
      # confirm the Molecule scenarios are configured to produce it.
      junit: ansible/molecule/test-results.xml
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
# Runs the pytest unit suite and publishes its JUnit report.
python_unit_tests:
  image: python:$PYTHON_VERSION
  stage: test
  before_script:
    - pip install -r tests/requirements.txt
  script:
    - python -m pytest tests/unit/ -v --junitxml=test-results.xml
  artifacts:
    # Artifacts are uploaded only on success by default; upload on failure as
    # well so failing test cases show up in the report.
    when: always
    reports:
      junit: test-results.xml
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
# ================================
# SECURITY STAGE
# ================================
# Scans the Terraform sources with Checkov and publishes a JUnit report.
# Findings do not block the pipeline.
terraform_security_scan:
  image:
    name: bridgecrew/checkov:latest
    # Clear the image entrypoint so the runner can start a shell for `script`.
    entrypoint: [""]
  stage: security
  script:
    # `--output-file-path` names a folder unless one target is given per
    # output format. Here the CLI output goes to the console and the JUnit
    # XML goes to the file declared under artifacts.
    - >-
      checkov -d terraform/ --framework terraform
      -o cli -o junitxml
      --output-file-path console,checkov-results.xml
  artifacts:
    # Checkov exits non-zero when it has findings; keep the report anyway.
    when: always
    reports:
      junit: checkov-results.xml
  allow_failure: true
  rules:
    - if: $SECURITY_SCAN_ENABLED == "true"
# Lints the playbooks and stores the findings as a SARIF file.
# Findings do not block the pipeline.
ansible_security_scan:
  image: quay.io/ansible/ansible-lint:latest
  stage: security
  script:
    # ansible-lint has no `--output` option; `--sarif-file` writes the SARIF
    # document while the normal findings are still printed to the job log.
    - ansible-lint ansible/playbooks/ --sarif-file ansible-security.sarif
  artifacts:
    # The job fails when there are findings; keep the file in that case too.
    when: always
    # Stored as a plain artifact: `reports:sast` expects GitLab's own security
    # report JSON schema, which a SARIF document does not satisfy.
    paths:
      - ansible-security.sarif
  allow_failure: true
  rules:
    - if: $SECURITY_SCAN_ENABLED == "true"
# Scans the checked-out tree for secrets with ggshield.
# Findings do not block the pipeline.
secret_detection:
  image: gitguardian/ggshield:latest
  stage: security
  script:
    # `.` is a directory, which `scan path` only accepts with --recursive;
    # --yes answers the confirmation prompt, since a CI job has no terminal.
    - ggshield secret scan path --recursive --yes .
  allow_failure: true
  rules:
    - if: $SECURITY_SCAN_ENABLED == "true"
# ================================
# STAGING DEPLOYMENT
# ================================
# Plans and applies the staging Terraform environment on the main branch.
deploy_staging_infrastructure:
  # A single merge key with a list replaces the two duplicate `<<` keys.
  <<: [*terraform_base, *terraform_cache]
  stage: deploy-staging
  environment:
    name: staging
    url: https://api-staging.${CI_PROJECT_NAME}.com
    deployment_tier: staging
  script:
    - cd environments/staging
    # The inherited before_script initialises $TF_ROOT only; this directory is
    # a separate working directory and has to be initialised before planning.
    # NOTE(review): the generated backend.tf also lives in $TF_ROOT, not here.
    # Confirm this environment declares its own remote state backend.
    - terraform init -input=false
    - terraform plan -input=false -out=staging.tfplan
    - terraform apply -input=false -auto-approve staging.tfplan
  artifacts:
    name: staging-infrastructure
    paths:
      - $TF_ROOT/environments/staging/staging.tfplan
    expire_in: 1 week
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
  timeout: 30m
# Applies the site playbook to the staging inventory once the staging
# infrastructure job has finished.
configure_staging_servers:
  # A single merge key with a list replaces the two duplicate `<<` keys;
  # duplicate keys in one mapping are not valid YAML.
  <<: [*ansible_base, *ansible_cache]
  stage: deploy-staging
  environment:
    name: staging
  needs:
    - deploy_staging_infrastructure
  script:
    # The vault password file is written by the inherited before_script.
    - ansible-playbook -i inventory/staging.yml playbooks/site.yml --vault-password-file /tmp/.vault-pass
  artifacts:
    name: staging-configuration
    paths:
      - $ANSIBLE_ROOT/logs/
    expire_in: 1 week
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
  timeout: 45m
# ================================
# INTEGRATION TESTS
# ================================
# Runs the inference API contract test script against $STAGING_API_URL after
# the staging servers have been configured.
api_contract_tests:
  stage: integration-test
  image: 'python:$PYTHON_VERSION'
  needs:
    - configure_staging_servers
  before_script:
    - pip install -r tests/contracts/requirements.txt
  script:
    - python tests/contracts/test_inference_api.py --api-url="$STAGING_API_URL"
  artifacts:
    reports:
      junit: 'tests/contracts/test-results.xml'
  rules:
    - if: '$CI_COMMIT_BRANCH == "main"'
# Runs the k6 inference load test against staging and publishes its summary
# as a load-performance report.
load_test:
  image:
    name: grafana/k6:latest
    # The image's default entrypoint is the `k6` binary; clear it so the
    # runner can start a shell and execute the script line.
    entrypoint: [""]
  stage: integration-test
  needs:
    - configure_staging_servers
  script:
    # --summary-export writes the JSON summary that the report below declares;
    # without it the file is only created if the test script does so itself.
    - >-
      k6 run
      --env API_URL="$STAGING_API_URL"
      --summary-export=tests/load/k6-report.json
      tests/load/k6_inference_test.js
  artifacts:
    # Keep the summary when thresholds fail the run.
    when: always
    reports:
      # k6 summaries belong to `load_performance`; `performance` is the
      # browser-performance report type.
      load_performance: tests/load/k6-report.json
  rules:
    - if: $LOAD_TEST_ENABLED == "true" && $CI_COMMIT_BRANCH == "main"
# Runs the end-to-end test script against $STAGING_API_URL after the staging
# servers have been configured.
end_to_end_test:
  stage: integration-test
  image: 'python:$PYTHON_VERSION'
  needs:
    - configure_staging_servers
  before_script:
    - pip install requests pytest
  script:
    - python tests/integration/e2e_test.py --staging-url="$STAGING_API_URL"
  artifacts:
    reports:
      junit: 'tests/integration/e2e-results.xml'
  rules:
    - if: '$CI_COMMIT_BRANCH == "main"'
# ================================
# PRODUCTION DEPLOYMENT
# ================================
# Plans and applies the production Terraform environment. Started manually on
# the main branch; `allow_failure: false` makes the manual step a blocking gate.
deploy_production_infrastructure:
  # A single merge key with a list replaces the two duplicate `<<` keys.
  <<: [*terraform_base, *terraform_cache]
  stage: deploy-production
  environment:
    name: production
    url: https://api.${CI_PROJECT_NAME}.com
    deployment_tier: production
  script:
    - cd environments/production
    # The inherited before_script initialises $TF_ROOT only; this directory is
    # a separate working directory and has to be initialised before planning.
    # NOTE(review): the generated backend.tf also lives in $TF_ROOT, not here.
    # Confirm this environment declares its own remote state backend.
    - terraform init -input=false
    - terraform plan -input=false -out=production.tfplan
    - terraform apply -input=false -auto-approve production.tfplan
  artifacts:
    name: production-infrastructure
    paths:
      - $TF_ROOT/environments/production/production.tfplan
    expire_in: 1 month
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
      when: manual
  allow_failure: false
  timeout: 30m
# Applies the site playbook to the production inventory. Started manually,
# after the production infrastructure job has finished.
configure_production_servers:
  # A single merge key with a list replaces the two duplicate `<<` keys;
  # duplicate keys in one mapping are not valid YAML.
  <<: [*ansible_base, *ansible_cache]
  stage: deploy-production
  environment:
    name: production
  needs:
    - deploy_production_infrastructure
  script:
    # The vault password file is written by the inherited before_script.
    - ansible-playbook -i inventory/production.yml playbooks/site.yml --vault-password-file /tmp/.vault-pass
  artifacts:
    name: production-configuration
    paths:
      - $ANSIBLE_ROOT/logs/
    expire_in: 1 month
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
      when: manual
  timeout: 45m
# ================================
# POST-DEPLOYMENT
# ================================
# Manually started probes of the production endpoints, run after the
# production servers have been configured.
production_smoke_tests:
  stage: post-deploy
  image: 'curlimages/curl:latest'
  needs:
    - configure_production_servers
  script:
    # One shell block; every probe uses `curl -f` followed by `|| exit 1`, so
    # the first failing request ends the job.
    - |
      echo "Running smoke tests against production..."
      # Health check
      curl -f "$PRODUCTION_API_URL/health" || exit 1
      echo "✓ Health check passed"
      # Models endpoint
      curl -f "$PRODUCTION_API_URL/v1/models" || exit 1
      echo "✓ Models endpoint accessible"
      # Metrics endpoint (internal)
      curl -f "$PRODUCTION_API_URL/metrics" || exit 1
      echo "✓ Metrics endpoint accessible"
      # Monitoring dashboard
      curl -f "$PRODUCTION_MONITORING_URL" || exit 1
      echo "✓ Monitoring dashboard accessible"
      echo "All smoke tests passed!"
  rules:
    - if: '$CI_COMMIT_BRANCH == "main"'
      when: manual
# Manually started k6 baseline run against production; publishes its summary
# as a load-performance report.
performance_baseline:
  image:
    name: grafana/k6:latest
    # The image's default entrypoint is the `k6` binary; clear it so the
    # runner can start a shell and execute the script line.
    entrypoint: [""]
  stage: post-deploy
  needs:
    - configure_production_servers
  script:
    # --summary-export writes the JSON summary that the report below declares;
    # without it the file is only created if the test script does so itself.
    - >-
      k6 run
      --env API_URL="$PRODUCTION_API_URL"
      --summary-export=tests/load/baseline-report.json
      tests/load/baseline_test.js
  artifacts:
    # Keep the summary when thresholds fail the run.
    when: always
    reports:
      # k6 summaries belong to `load_performance`; `performance` is the
      # browser-performance report type.
      load_performance: tests/load/baseline-report.json
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
      when: manual
# Manually started job that runs the cost analysis script twice, once per
# output format, and keeps both reports as artifacts.
cost_analysis:
  stage: post-deploy
  image: 'python:$PYTHON_VERSION'
  before_script:
    - pip install hcloud python-dateutil jinja2
  script:
    # The script's stdout is redirected into the report files.
    - python scripts/cost-analysis.py --environment=production --format=json > cost-report.json
    - python scripts/cost-analysis.py --environment=production --format=markdown > cost-report.md
  artifacts:
    name: 'cost-analysis-$CI_COMMIT_SHORT_SHA'
    expire_in: '1 month'
    paths:
      - cost-report.json
      - cost-report.md
  rules:
    - if: '$CI_COMMIT_BRANCH == "main"'
      when: manual
# ================================
# CLEANUP AND UTILITIES
# ================================
# Manually started teardown of the staging Terraform environment; registered
# as the stop action of the `staging` environment.
destroy_staging:
  <<: *terraform_base
  stage: deploy-staging
  environment:
    name: staging
    action: stop
  script:
    - cd environments/staging
    # The inherited before_script initialises $TF_ROOT only; initialise this
    # working directory before destroying it.
    # NOTE(review): confirm this directory is configured with the same state
    # backend the staging deploy job uses, otherwise there is nothing to destroy.
    - terraform init -input=false
    - terraform destroy -input=false -auto-approve
  rules:
    - if: $CI_PIPELINE_SOURCE == "web"
      when: manual
    - if: $CI_COMMIT_BRANCH != "main"
      when: manual
  # Manual jobs declared through `rules` are blocking by default, which would
  # leave every non-main pipeline waiting on this optional teardown.
  allow_failure: true
# ================================
# SCHEDULED JOBS
# ================================
# Nightly scheduled run of the Terraform test job, once per ENVIRONMENT value.
nightly_full_test:
  # The job to inherit from is named `terraform_test`; there is no hidden
  # `.terraform_test` job, and an unknown `extends` target invalidates the
  # whole pipeline configuration.
  extends: terraform_test
  rules:
    - if: $CI_PIPELINE_SOURCE == "schedule" && $SCHEDULE_TYPE == "nightly"
  parallel:
    matrix:
      - ENVIRONMENT: [staging, production]
# Weekly scheduled variant of terraform_security_scan; only the rules are
# overridden here.
weekly_security_scan:
  extends: terraform_security_scan
  rules:
    - if: '$CI_PIPELINE_SOURCE == "schedule" && $SCHEDULE_TYPE == "weekly"'
# ================================
# DEPLOYMENT NOTIFICATIONS
# ================================
# Posts a success message to Slack after the production smoke tests pass.
# Does nothing when SLACK_WEBHOOK_URL is empty.
notify_deployment_success:
  stage: post-deploy
  image: 'curlimages/curl:latest'
  needs:
    - production_smoke_tests
  script:
    - |
      if [ -n "$SLACK_WEBHOOK_URL" ]; then
        curl -X POST -H 'Content-type: application/json' \
          --data "{\"text\":\"🚀 Production deployment successful for commit $CI_COMMIT_SHORT_SHA\"}" \
          "$SLACK_WEBHOOK_URL"
      fi
  rules:
    - if: '$CI_COMMIT_BRANCH == "main"'
      when: on_success
# Posts a failure message with the pipeline URL to Slack when the rule below
# selects the job with `when: on_failure`.
# Does nothing when SLACK_WEBHOOK_URL is empty.
notify_deployment_failure:
  stage: post-deploy
  image: 'curlimages/curl:latest'
  script:
    - |
      if [ -n "$SLACK_WEBHOOK_URL" ]; then
        curl -X POST -H 'Content-type: application/json' \
          --data "{\"text\":\"❌ Production deployment failed for commit $CI_COMMIT_SHORT_SHA. Check pipeline: $CI_PIPELINE_URL\"}" \
          "$SLACK_WEBHOOK_URL"
      fi
  rules:
    - if: '$CI_COMMIT_BRANCH == "main"'
      when: on_failure