---
# GitLab CI/CD Pipeline for AI Infrastructure
# Production-ready pipeline with comprehensive testing and deployment

# Pipeline stages, listed in execution order.
stages:
  - validate
  - test
  - security
  - deploy-staging
  - integration-test
  - deploy-production
  - post-deploy

# Pipeline-wide variables shared by the jobs below.
variables:
  # Directories the Terraform and Ansible templates change into.
  TF_ROOT: terraform
  ANSIBLE_ROOT: ansible

  # Tool versions; quoted so they are always read as strings.
  TF_VERSION: "1.6.0"
  ANSIBLE_VERSION: "8.5.0"
  PYTHON_VERSION: "3.11"
  GO_VERSION: "1.21"

  # Terraform state configuration
  TF_STATE_NAME: ai-infrastructure
  TF_CACHE_KEY: "$CI_COMMIT_REF_SLUG"

  # Security scanning (checked by the `rules` of the security-stage jobs)
  SECURITY_SCAN_ENABLED: "true"

  # Performance testing (checked by the `rules` of the load_test job)
  LOAD_TEST_ENABLED: "true"

  # Deployment settings
  DEPLOY_TIMEOUT: "1800"  # 30 minutes

# Templates for reusability

# Shared Terraform job template: selects the Terraform image, changes into
# $TF_ROOT, writes an HTTP backend configuration that points at this project's
# GitLab-managed Terraform state, and runs `terraform init`.
.terraform_base: &terraform_base
  image:
    name: hashicorp/terraform:$TF_VERSION
    # The image's default entrypoint is the terraform binary itself; clear it
    # so the runner can start a shell and execute the script sections.
    entrypoint: [""]
  before_script:
    - cd $TF_ROOT
    - terraform --version
    # The heredoc delimiter is unquoted, so the shell expands the CI variables
    # (including the job token) while writing backend.tf.
    - |
      cat << EOF > backend.tf
      terraform {
        backend "http" {
          address = "$CI_API_V4_URL/projects/$CI_PROJECT_ID/terraform/state/$TF_STATE_NAME"
          lock_address = "$CI_API_V4_URL/projects/$CI_PROJECT_ID/terraform/state/$TF_STATE_NAME/lock"
          unlock_address = "$CI_API_V4_URL/projects/$CI_PROJECT_ID/terraform/state/$TF_STATE_NAME/lock"
          username = "gitlab-ci-token"
          password = "$CI_JOB_TOKEN"
          lock_method = "POST"
          unlock_method = "DELETE"
          retry_wait_min = 5
        }
      }
      EOF
    # -input=false makes init fail fast instead of waiting for a prompt in CI.
    - terraform init -input=false

# Shared Ansible job template: changes into $ANSIBLE_ROOT, installs the Galaxy
# requirements and writes the vault password to /tmp/.vault-pass for the
# playbook runs that pass --vault-password-file.
.ansible_base: &ansible_base
  image: quay.io/ansible/ansible-runner:latest
  before_script:
    - cd $ANSIBLE_ROOT
    - ansible --version
    - ansible-galaxy install -r requirements.yml
    # Create the password file under a restrictive umask so it is never
    # readable by other users, and use printf so backslashes in the password
    # are written literally regardless of the shell's echo behaviour.
    - (umask 077 && printf '%s\n' "$ANSIBLE_VAULT_PASSWORD" > /tmp/.vault-pass)
    # Also covers the case where the file already existed with wider permissions.
    - chmod 600 /tmp/.vault-pass

# Shared Docker-in-Docker job template: a docker client image with a dind
# service, plus the daemon address and certificate directory variables.
.docker_base: &docker_base
  image: docker:latest
  services:
    - docker:dind
  variables:
    DOCKER_TLS_CERTDIR: "/certs"
    DOCKER_HOST: tcp://docker:2376

# Cache configurations

# Terraform cache, keyed by ref slug: the .terraform working directory and the
# dependency lock file under $TF_ROOT.
.terraform_cache: &terraform_cache
  cache:
    key: terraform-$CI_COMMIT_REF_SLUG
    paths:
      - $TF_ROOT/.terraform/
      - $TF_ROOT/.terraform.lock.hcl

# Ansible cache, keyed by ref slug: the collections and roles directories
# under $ANSIBLE_ROOT.
.ansible_cache: &ansible_cache
  cache:
    key: ansible-$CI_COMMIT_REF_SLUG
    paths:
      - $ANSIBLE_ROOT/collections/
      - $ANSIBLE_ROOT/roles/

# ================================
# VALIDATION STAGE
# ================================

# Fails when `terraform fmt` would change any file under $TF_ROOT.
# Runs for merge request pipelines and for the main branch.
terraform_format_check:
  # A mapping may contain the `<<` key only once, so both templates are merged
  # through a single merge key holding a list of aliases.
  <<: [*terraform_base, *terraform_cache]
  stage: validate
  script:
    - terraform fmt -check=true -recursive
  rules:
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
    - if: $CI_COMMIT_BRANCH == "main"

# Runs `terraform validate` in the dev, staging and production environment
# directories. Runs for merge request pipelines and for the main branch.
terraform_validate:
  # A mapping may contain the `<<` key only once, so both templates are merged
  # through a single merge key holding a list of aliases.
  <<: [*terraform_base, *terraform_cache]
  stage: validate
  script:
    # The shared before_script only initialises $TF_ROOT. `terraform validate`
    # needs an initialised working directory, so each environment directory is
    # initialised first; -backend=false avoids touching any remote state.
    - cd environments/dev
    - terraform init -backend=false -input=false
    - terraform validate
    - cd ../staging
    - terraform init -backend=false -input=false
    - terraform validate
    - cd ../production
    - terraform init -backend=false -input=false
    - terraform validate
  rules:
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
    - if: $CI_COMMIT_BRANCH == "main"

# Syntax-checks the site and gex44-setup playbooks.
# Runs for merge request pipelines and for the main branch.
ansible_syntax_check:
  # A mapping may contain the `<<` key only once, so both templates are merged
  # through a single merge key holding a list of aliases.
  <<: [*ansible_base, *ansible_cache]
  stage: validate
  script:
    - ansible-playbook --syntax-check playbooks/site.yml
    - ansible-playbook --syntax-check playbooks/gex44-setup.yml
  rules:
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
    - if: $CI_COMMIT_BRANCH == "main"

# Lints the playbooks directory with ansible-lint.
# Runs for merge request pipelines and for the main branch.
ansible_lint:
  # A mapping may contain the `<<` key only once, so both templates are merged
  # through a single merge key holding a list of aliases.
  <<: [*ansible_base, *ansible_cache]
  stage: validate
  script:
    # No `|| true` here: it forced the job to pass and hid every finding.
    - ansible-lint playbooks/
  # Non-blocking for now: a lint failure is reported as a warning and does not
  # fail the pipeline.
  allow_failure: true
  rules:
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
    - if: $CI_COMMIT_BRANCH == "main"

# Installs yamllint and runs it over the pipeline definition and the ansible/
# and monitoring/ directories.
# Runs for merge request pipelines and for the main branch.
yaml_lint:
  stage: validate
  image: python:$PYTHON_VERSION-slim
  rules:
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
    - if: $CI_COMMIT_BRANCH == "main"
  before_script:
    - pip install yamllint
  script:
    - yamllint .gitlab-ci.yml
    - yamllint ansible/
    - yamllint monitoring/

# ================================
# TEST STAGE
# ================================

# Runs the Go test suite under tests/terraform with a 30 minute timeout.
# Runs for the main branch and for merge request pipelines.
terraform_test:
  stage: test
  image: golang:$GO_VERSION
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
  before_script:
    - cd tests/terraform
    - go mod download
  script:
    - go test -v -timeout 30m ./...
  artifacts:
    reports:
      # NOTE(review): nothing in this job visibly writes test-results.xml;
      # confirm the test suite produces it, otherwise the report stays empty.
      junit: tests/terraform/test-results.xml

# Runs the Molecule scenarios of the vllm and cuda roles against
# Docker-in-Docker. Runs on the main branch only.
ansible_molecule_test:
  # A mapping may contain the `<<` key only once, so both templates are merged
  # through a single merge key holding a list of aliases.
  <<: [*docker_base, *ansible_cache]
  stage: test
  before_script:
    - apk add --no-cache python3 py3-pip
    # The extras specifier is quoted so the shell cannot treat the square
    # brackets as a glob pattern.
    - pip3 install ansible 'molecule[docker]' docker
    - cd $ANSIBLE_ROOT
  script:
    - cd roles/vllm && molecule test
    # The working directory persists between script lines, so ../cuda is the
    # sibling role directory.
    - cd ../cuda && molecule test
  artifacts:
    reports:
      # NOTE(review): nothing in this job visibly writes this file; confirm
      # the Molecule scenarios produce it.
      junit: ansible/molecule/test-results.xml
  rules:
    - if: $CI_COMMIT_BRANCH == "main"

# Installs the test requirements and runs pytest over tests/unit/, publishing
# the resulting JUnit XML as a test report.
# Runs for the main branch and for merge request pipelines.
python_unit_tests:
  stage: test
  image: python:$PYTHON_VERSION
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
  before_script:
    - pip install -r tests/requirements.txt
  script:
    - python -m pytest tests/unit/ -v --junitxml=test-results.xml
  artifacts:
    reports:
      junit: test-results.xml

# ================================
# SECURITY STAGE
# ================================

# Scans the terraform/ directory with Checkov and publishes the findings as a
# JUnit report. Non-blocking; runs when SECURITY_SCAN_ENABLED is "true".
terraform_security_scan:
  image:
    name: bridgecrew/checkov:latest
    # Replace the image entrypoint so the runner can start a shell for the
    # script section.
    entrypoint:
      - '/usr/bin/env'
      - 'PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin'
  stage: security
  script:
    # Redirect the JUnit XML from stdout into the report file.
    # `--output-file-path` names an output folder, so passing it a file name
    # never produced the checkov-results.xml file referenced below.
    - checkov -d terraform/ --framework terraform --output junitxml > checkov-results.xml
  artifacts:
    # Checkov exits non-zero when it has findings; upload the report anyway.
    when: always
    reports:
      junit: checkov-results.xml
  allow_failure: true
  rules:
    - if: $SECURITY_SCAN_ENABLED == "true"

# Runs ansible-lint over the playbooks and stores the findings as SARIF.
# Non-blocking; runs when SECURITY_SCAN_ENABLED is "true".
ansible_security_scan:
  image: quay.io/ansible/ansible-lint:latest
  stage: security
  script:
    # The selected format is printed to stdout, so it is redirected into the
    # report file instead of relying on an `--output` option.
    - ansible-lint ansible/playbooks/ --format sarif > ansible-security.sarif
  artifacts:
    # ansible-lint exits non-zero when it has findings; upload the report anyway.
    when: always
    reports:
      # NOTE(review): the `sast` report type may expect GitLab's own SAST JSON
      # schema rather than SARIF — confirm GitLab can ingest this file.
      sast: ansible-security.sarif
  allow_failure: true
  rules:
    - if: $SECURITY_SCAN_ENABLED == "true"

# Scans the repository checkout for secrets with ggshield.
# Non-blocking; runs when SECURITY_SCAN_ENABLED is "true".
secret_detection:
  image: gitguardian/ggshield:latest
  stage: security
  script:
    # `.` is a directory, so the path scan must be recursive; --yes confirms
    # the recursive scan up front because CI has no terminal to prompt on.
    - ggshield secret scan path --recursive --yes .
  allow_failure: true
  rules:
    - if: $SECURITY_SCAN_ENABLED == "true"

# ================================
# STAGING DEPLOYMENT
# ================================

# Plans and applies the Terraform configuration in environments/staging and
# keeps the plan file as an artifact for one week. Runs on the main branch.
deploy_staging_infrastructure:
  # A mapping may contain the `<<` key only once, so both templates are merged
  # through a single merge key holding a list of aliases.
  <<: [*terraform_base, *terraform_cache]
  stage: deploy-staging
  environment:
    name: staging
    url: https://api-staging.${CI_PROJECT_NAME}.com
    deployment_tier: staging
  script:
    # NOTE(review): the shared before_script runs `terraform init` in $TF_ROOT,
    # not in environments/staging, and every environment uses the same
    # TF_STATE_NAME — confirm this directory is initialised and that sharing
    # one state across environments is intended.
    - cd environments/staging
    # -input=false makes Terraform fail instead of waiting for a prompt in CI.
    - terraform plan -input=false -out=staging.tfplan
    - terraform apply -input=false -auto-approve staging.tfplan
  artifacts:
    name: staging-infrastructure
    paths:
      - $TF_ROOT/environments/staging/staging.tfplan
    expire_in: 1 week
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
  timeout: 30m

# Applies the site playbook to the staging inventory once the staging
# infrastructure job has finished, keeping the Ansible logs for one week.
# Runs on the main branch.
configure_staging_servers:
  # A mapping may contain the `<<` key only once, so both templates are merged
  # through a single merge key holding a list of aliases.
  <<: [*ansible_base, *ansible_cache]
  stage: deploy-staging
  environment:
    name: staging
  needs: ["deploy_staging_infrastructure"]
  script:
    # /tmp/.vault-pass is written by the shared Ansible before_script.
    - ansible-playbook -i inventory/staging.yml playbooks/site.yml --vault-password-file /tmp/.vault-pass
  artifacts:
    name: staging-configuration
    paths:
      - $ANSIBLE_ROOT/logs/
    expire_in: 1 week
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
  timeout: 45m

# ================================
# INTEGRATION TESTS
# ================================

# Runs the inference API contract tests against $STAGING_API_URL once the
# staging servers are configured. Runs on the main branch.
api_contract_tests:
  stage: integration-test
  image: python:$PYTHON_VERSION
  needs: ["configure_staging_servers"]
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
  before_script:
    - pip install -r tests/contracts/requirements.txt
  script:
    - python tests/contracts/test_inference_api.py --api-url="$STAGING_API_URL"
  artifacts:
    reports:
      junit: tests/contracts/test-results.xml

# Runs the k6 inference load test against $STAGING_API_URL once the staging
# servers are configured. Runs on main when LOAD_TEST_ENABLED is "true".
load_test:
  image:
    name: grafana/k6:latest
    # The image's default entrypoint is the k6 binary itself; clear it so the
    # runner can start a shell and execute the script section.
    entrypoint: [""]
  stage: integration-test
  needs: ["configure_staging_servers"]
  script:
    # --summary-export writes the JSON summary that the report below uploads;
    # without it the report file is never created.
    - >-
      k6 run
      --summary-export=tests/load/k6-report.json
      --env API_URL="$STAGING_API_URL"
      tests/load/k6_inference_test.js
  artifacts:
    reports:
      # k6 summaries belong to the load performance report type.
      load_performance: tests/load/k6-report.json
  rules:
    - if: $LOAD_TEST_ENABLED == "true" && $CI_COMMIT_BRANCH == "main"

# Runs the end-to-end test script against $STAGING_API_URL once the staging
# servers are configured. Runs on the main branch.
end_to_end_test:
  stage: integration-test
  image: python:$PYTHON_VERSION
  needs: ["configure_staging_servers"]
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
  before_script:
    - pip install requests pytest
  script:
    - python tests/integration/e2e_test.py --staging-url="$STAGING_API_URL"
  artifacts:
    reports:
      junit: tests/integration/e2e-results.xml

# ================================
# PRODUCTION DEPLOYMENT
# ================================

# Plans and applies the Terraform configuration in environments/production and
# keeps the plan file as an artifact for one month. Manual, blocking job on
# the main branch.
deploy_production_infrastructure:
  # A mapping may contain the `<<` key only once, so both templates are merged
  # through a single merge key holding a list of aliases.
  <<: [*terraform_base, *terraform_cache]
  stage: deploy-production
  environment:
    name: production
    url: https://api.${CI_PROJECT_NAME}.com
    deployment_tier: production
  script:
    # NOTE(review): the shared before_script runs `terraform init` in $TF_ROOT,
    # not in environments/production, and every environment uses the same
    # TF_STATE_NAME — confirm this directory is initialised and that sharing
    # one state across environments is intended.
    - cd environments/production
    # -input=false makes Terraform fail instead of waiting for a prompt in CI.
    - terraform plan -input=false -out=production.tfplan
    - terraform apply -input=false -auto-approve production.tfplan
  artifacts:
    name: production-infrastructure
    paths:
      - $TF_ROOT/environments/production/production.tfplan
    expire_in: 1 month
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
      when: manual
      # Keep the job blocking: later stages wait until it has been run.
      allow_failure: false
  timeout: 30m

# Applies the site playbook to the production inventory after the production
# infrastructure job, keeping the Ansible logs for one month. Manual job on
# the main branch.
configure_production_servers:
  # A mapping may contain the `<<` key only once, so both templates are merged
  # through a single merge key holding a list of aliases.
  <<: [*ansible_base, *ansible_cache]
  stage: deploy-production
  environment:
    name: production
  needs: ["deploy_production_infrastructure"]
  script:
    # /tmp/.vault-pass is written by the shared Ansible before_script.
    - ansible-playbook -i inventory/production.yml playbooks/site.yml --vault-password-file /tmp/.vault-pass
  artifacts:
    name: production-configuration
    paths:
      - $ANSIBLE_ROOT/logs/
    expire_in: 1 month
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
      when: manual
  timeout: 45m

# ================================
# POST-DEPLOYMENT
# ================================

# Requests the health, models and metrics endpoints of $PRODUCTION_API_URL and
# the monitoring URL; the job fails on the first request curl reports as
# failed. Manual job on the main branch, after the production servers are
# configured.
production_smoke_tests:
  stage: post-deploy
  image: curlimages/curl:latest
  needs: ["configure_production_servers"]
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
      when: manual
  script:
    # `curl -f` returns non-zero on HTTP error responses, which triggers the
    # explicit `exit 1` and stops the remaining checks.
    - |
      echo "Running smoke tests against production..."

      # Health check
      curl -f "$PRODUCTION_API_URL/health" || exit 1
      echo "✓ Health check passed"

      # Models endpoint
      curl -f "$PRODUCTION_API_URL/v1/models" || exit 1
      echo "✓ Models endpoint accessible"

      # Metrics endpoint (internal)
      curl -f "$PRODUCTION_API_URL/metrics" || exit 1
      echo "✓ Metrics endpoint accessible"

      # Monitoring dashboard
      curl -f "$PRODUCTION_MONITORING_URL" || exit 1
      echo "✓ Monitoring dashboard accessible"

      echo "All smoke tests passed!"

# Runs the k6 baseline test against $PRODUCTION_API_URL after the production
# servers are configured. Manual job on the main branch.
performance_baseline:
  image:
    name: grafana/k6:latest
    # The image's default entrypoint is the k6 binary itself; clear it so the
    # runner can start a shell and execute the script section.
    entrypoint: [""]
  stage: post-deploy
  needs: ["configure_production_servers"]
  script:
    # --summary-export writes the JSON summary that the report below uploads;
    # without it the report file is never created.
    - >-
      k6 run
      --summary-export=tests/load/baseline-report.json
      --env API_URL="$PRODUCTION_API_URL"
      tests/load/baseline_test.js
  artifacts:
    reports:
      # k6 summaries belong to the load performance report type.
      load_performance: tests/load/baseline-report.json
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
      when: manual

# Runs the cost analysis script for production twice, once per output format,
# and keeps the JSON and Markdown reports for one month.
# Manual job on the main branch.
cost_analysis:
  stage: post-deploy
  image: python:$PYTHON_VERSION
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
      when: manual
  before_script:
    - pip install hcloud python-dateutil jinja2
  script:
    - python scripts/cost-analysis.py --environment=production --format=json > cost-report.json
    - python scripts/cost-analysis.py --environment=production --format=markdown > cost-report.md
  artifacts:
    name: cost-analysis-$CI_COMMIT_SHORT_SHA
    expire_in: 1 month
    paths:
      - cost-report.json
      - cost-report.md

# ================================
# CLEANUP AND UTILITIES
# ================================

# Destroys the staging Terraform resources and stops the staging environment.
# Offered as an optional manual action for web-triggered pipelines and for
# pipelines that are not on the main branch.
destroy_staging:
  <<: *terraform_base
  stage: deploy-staging
  environment:
    name: staging
    action: stop
  script:
    # NOTE(review): the shared before_script runs `terraform init` in $TF_ROOT,
    # not in environments/staging — confirm this directory is initialised.
    - cd environments/staging
    - terraform destroy -auto-approve
  rules:
    # `when: manual` inside `rules` defaults to allow_failure: false, which
    # leaves every matching pipeline "blocked" until someone runs the destroy.
    # Marking the rules allow_failure: true makes this an optional action.
    - if: $CI_PIPELINE_SOURCE == "web"
      when: manual
      allow_failure: true
    - if: $CI_COMMIT_BRANCH != "main"
      when: manual
      allow_failure: true

# ================================
# SCHEDULED JOBS
# ================================

# Nightly run of the Terraform test job, once per ENVIRONMENT value.
# Runs for scheduled pipelines whose SCHEDULE_TYPE is "nightly".
nightly_full_test:
  # The Go test job in this file is named `terraform_test`; no hidden
  # `.terraform_test` template is defined here, and extending an undefined job
  # makes the pipeline configuration invalid.
  extends: terraform_test
  rules:
    - if: $CI_PIPELINE_SOURCE == "schedule" && $SCHEDULE_TYPE == "nightly"
  parallel:
    matrix:
      - ENVIRONMENT: [staging, production]

# Weekly run of the Terraform security scan: inherits the
# terraform_security_scan job and replaces its rules so that it runs for
# scheduled pipelines whose SCHEDULE_TYPE is "weekly".
weekly_security_scan:
  extends: terraform_security_scan
  rules:
    - if: $CI_PIPELINE_SOURCE == "schedule" && $SCHEDULE_TYPE == "weekly"

# ================================
# DEPLOYMENT NOTIFICATIONS
# ================================

# Posts a success message to Slack after the production smoke tests, on the
# main branch. Does nothing when SLACK_WEBHOOK_URL is empty.
notify_deployment_success:
  stage: post-deploy
  image: curlimages/curl:latest
  needs: ["production_smoke_tests"]
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
      when: on_success
  script:
    # The webhook is optional: the POST is skipped when the variable is unset.
    - |
      if [ -n "$SLACK_WEBHOOK_URL" ]; then
        curl -X POST -H 'Content-type: application/json' \
          --data "{\"text\":\"🚀 Production deployment successful for commit $CI_COMMIT_SHORT_SHA\"}" \
          "$SLACK_WEBHOOK_URL"
      fi

# Posts a failure message with the pipeline URL to Slack when an earlier job
# has failed on the main branch. Does nothing when SLACK_WEBHOOK_URL is empty.
notify_deployment_failure:
  stage: post-deploy
  image: curlimages/curl:latest
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
      when: on_failure
  script:
    # The webhook is optional: the POST is skipped when the variable is unset.
    - |
      if [ -n "$SLACK_WEBHOOK_URL" ]; then
        curl -X POST -H 'Content-type: application/json' \
          --data "{\"text\":\"❌ Production deployment failed for commit $CI_COMMIT_SHORT_SHA. Check pipeline: $CI_PIPELINE_URL\"}" \
          "$SLACK_WEBHOOK_URL"
      fi
