diff --git a/.github/workflows/agentic-observability-kit.lock.yml b/.github/workflows/agentic-observability-kit.lock.yml
new file mode 100644
index 0000000000..d5f9bcbfa1
--- /dev/null
+++ b/.github/workflows/agentic-observability-kit.lock.yml
@@ -0,0 +1,1243 @@
+# ___ _ _
+# / _ \ | | (_)
+# | |_| | __ _ ___ _ __ | |_ _ ___
+# | _ |/ _` |/ _ \ '_ \| __| |/ __|
+# | | | | (_| | __/ | | | |_| | (__
+# \_| |_/\__, |\___|_| |_|\__|_|\___|
+# __/ |
+# _ _ |___/
+# | | | | / _| |
+# | | | | ___ _ __ _ __| |_| | _____ ____
+# | |/\| |/ _ \ '__| |/ /| _| |/ _ \ \ /\ / / ___|
+# \ /\ / (_) | | | | ( | | | | (_) \ V V /\__ \
+# \/ \/ \___/|_| |_|\_\|_| |_|\___/ \_/\_/ |___/
+#
+# This file was automatically generated by gh-aw. DO NOT EDIT.
+#
+# To update this file, edit the corresponding .md file and run:
+# gh aw compile
+# Not all edits will cause changes to this file.
+#
+# For more information: https://github.github.com/gh-aw/introduction/overview/
+#
+# Drop-in observability kit for repositories using agentic workflows
+#
+# Resolved workflow manifest:
+# Imports:
+# - shared/reporting.md
+#
+# gh-aw-metadata: {"schema_version":"v3","frontmatter_hash":"a8b23dcad0059913cb1a28b0793d0e220cf27f9ecd567523892d6c11a3e5868b","strict":true,"agent_id":"copilot"}
+
+name: "Agentic Observability Kit"
+"on":
+ schedule:
+ - cron: "7 8 * * 1"
+ # Friendly format: weekly on monday around 08:00 (scattered)
+ workflow_dispatch:
+ inputs:
+ aw_context:
+ default: ""
+ description: Agent caller context (used internally by Agentic Workflows).
+ required: false
+ type: string
+
+permissions: {}
+
+concurrency:
+ group: "gh-aw-${{ github.workflow }}"
+
+run-name: "Agentic Observability Kit"
+
+jobs:
+ activation:
+ runs-on: ubuntu-slim
+ permissions:
+ contents: read
+ outputs:
+ comment_id: ""
+ comment_repo: ""
+ lockdown_check_failed: ${{ steps.generate_aw_info.outputs.lockdown_check_failed == 'true' }}
+ model: ${{ steps.generate_aw_info.outputs.model }}
+ secret_verification_result: ${{ steps.validate-secret.outputs.verification_result }}
+ steps:
+ - name: Checkout actions folder
+ uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+ with:
+ repository: github/gh-aw
+ sparse-checkout: |
+ actions
+ persist-credentials: false
+ - name: Setup Scripts
+ uses: ./actions/setup
+ with:
+ destination: ${{ runner.temp }}/gh-aw/actions
+ - name: Generate agentic run info
+ id: generate_aw_info
+ env:
+ GH_AW_INFO_ENGINE_ID: "copilot"
+ GH_AW_INFO_ENGINE_NAME: "GitHub Copilot CLI"
+ GH_AW_INFO_MODEL: ${{ vars.GH_AW_MODEL_AGENT_COPILOT || 'auto' }}
+ GH_AW_INFO_VERSION: "latest"
+ GH_AW_INFO_AGENT_VERSION: "latest"
+ GH_AW_INFO_WORKFLOW_NAME: "Agentic Observability Kit"
+ GH_AW_INFO_EXPERIMENTAL: "false"
+ GH_AW_INFO_SUPPORTS_TOOLS_ALLOWLIST: "true"
+ GH_AW_INFO_STAGED: "false"
+ GH_AW_INFO_ALLOWED_DOMAINS: '["defaults"]'
+ GH_AW_INFO_FIREWALL_ENABLED: "true"
+ GH_AW_INFO_AWF_VERSION: "v0.25.0"
+ GH_AW_INFO_AWMG_VERSION: ""
+ GH_AW_INFO_FIREWALL_TYPE: "squid"
+ GH_AW_COMPILED_STRICT: "true"
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8
+ with:
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/generate_aw_info.cjs');
+ await main(core, context);
+ - name: Validate COPILOT_GITHUB_TOKEN secret
+ id: validate-secret
+ run: ${RUNNER_TEMP}/gh-aw/actions/validate_multi_secret.sh COPILOT_GITHUB_TOKEN 'GitHub Copilot CLI' https://github.github.com/gh-aw/reference/engines/#github-copilot-default
+ env:
+ COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }}
+ - name: Checkout .github and .agents folders
+ uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+ with:
+ persist-credentials: false
+ sparse-checkout: |
+ .github
+ .agents
+ sparse-checkout-cone-mode: true
+ fetch-depth: 1
+ - name: Check workflow file timestamps
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8
+ env:
+ GH_AW_WORKFLOW_FILE: "agentic-observability-kit.lock.yml"
+ with:
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/check_workflow_timestamp_api.cjs');
+ await main();
+ - name: Create prompt with built-in context
+ env:
+ GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt
+ GH_AW_SAFE_OUTPUTS: ${{ runner.temp }}/gh-aw/safeoutputs/outputs.jsonl
+ GH_AW_GITHUB_ACTOR: ${{ github.actor }}
+ GH_AW_GITHUB_EVENT_COMMENT_ID: ${{ github.event.comment.id }}
+ GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER: ${{ github.event.discussion.number }}
+ GH_AW_GITHUB_EVENT_ISSUE_NUMBER: ${{ github.event.issue.number }}
+ GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER: ${{ github.event.pull_request.number }}
+ GH_AW_GITHUB_REPOSITORY: ${{ github.repository }}
+ GH_AW_GITHUB_RUN_ID: ${{ github.run_id }}
+ GH_AW_GITHUB_WORKSPACE: ${{ github.workspace }}
+ # poutine:ignore untrusted_checkout_exec
+ run: |
+ bash ${RUNNER_TEMP}/gh-aw/actions/create_prompt_first.sh
+ {
+ cat << 'GH_AW_PROMPT_EOF'
+
+ GH_AW_PROMPT_EOF
+ cat "${RUNNER_TEMP}/gh-aw/prompts/xpia.md"
+ cat "${RUNNER_TEMP}/gh-aw/prompts/temp_folder_prompt.md"
+ cat "${RUNNER_TEMP}/gh-aw/prompts/markdown.md"
+ cat "${RUNNER_TEMP}/gh-aw/prompts/agentic_workflows_guide.md"
+ cat "${RUNNER_TEMP}/gh-aw/prompts/safe_outputs_prompt.md"
+ cat << 'GH_AW_PROMPT_EOF'
+
+ Tools: create_issue, create_discussion, missing_tool, missing_data, noop
+
+
+ The following GitHub context information is available for this workflow:
+ {{#if __GH_AW_GITHUB_ACTOR__ }}
+ - **actor**: __GH_AW_GITHUB_ACTOR__
+ {{/if}}
+ {{#if __GH_AW_GITHUB_REPOSITORY__ }}
+ - **repository**: __GH_AW_GITHUB_REPOSITORY__
+ {{/if}}
+ {{#if __GH_AW_GITHUB_WORKSPACE__ }}
+ - **workspace**: __GH_AW_GITHUB_WORKSPACE__
+ {{/if}}
+ {{#if __GH_AW_GITHUB_EVENT_ISSUE_NUMBER__ }}
+ - **issue-number**: #__GH_AW_GITHUB_EVENT_ISSUE_NUMBER__
+ {{/if}}
+ {{#if __GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER__ }}
+ - **discussion-number**: #__GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER__
+ {{/if}}
+ {{#if __GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER__ }}
+ - **pull-request-number**: #__GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER__
+ {{/if}}
+ {{#if __GH_AW_GITHUB_EVENT_COMMENT_ID__ }}
+ - **comment-id**: __GH_AW_GITHUB_EVENT_COMMENT_ID__
+ {{/if}}
+ {{#if __GH_AW_GITHUB_RUN_ID__ }}
+ - **workflow-run-id**: __GH_AW_GITHUB_RUN_ID__
+ {{/if}}
+
+
+ GH_AW_PROMPT_EOF
+ cat "${RUNNER_TEMP}/gh-aw/prompts/github_mcp_tools_with_safeoutputs_prompt.md"
+ cat << 'GH_AW_PROMPT_EOF'
+
+ GH_AW_PROMPT_EOF
+ cat << 'GH_AW_PROMPT_EOF'
+ {{#runtime-import .github/workflows/shared/reporting.md}}
+ GH_AW_PROMPT_EOF
+ cat << 'GH_AW_PROMPT_EOF'
+ {{#runtime-import .github/workflows/agentic-observability-kit.md}}
+ GH_AW_PROMPT_EOF
+ } > "$GH_AW_PROMPT"
+ - name: Interpolate variables and render templates
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8
+ env:
+ GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt
+ with:
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/interpolate_prompt.cjs');
+ await main();
+ - name: Substitute placeholders
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8
+ env:
+ GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt
+ GH_AW_GITHUB_ACTOR: ${{ github.actor }}
+ GH_AW_GITHUB_EVENT_COMMENT_ID: ${{ github.event.comment.id }}
+ GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER: ${{ github.event.discussion.number }}
+ GH_AW_GITHUB_EVENT_ISSUE_NUMBER: ${{ github.event.issue.number }}
+ GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER: ${{ github.event.pull_request.number }}
+ GH_AW_GITHUB_REPOSITORY: ${{ github.repository }}
+ GH_AW_GITHUB_RUN_ID: ${{ github.run_id }}
+ GH_AW_GITHUB_WORKSPACE: ${{ github.workspace }}
+ with:
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io);
+
+ const substitutePlaceholders = require('${{ runner.temp }}/gh-aw/actions/substitute_placeholders.cjs');
+
+ // Call the substitution function
+ return await substitutePlaceholders({
+ file: process.env.GH_AW_PROMPT,
+ substitutions: {
+ GH_AW_GITHUB_ACTOR: process.env.GH_AW_GITHUB_ACTOR,
+ GH_AW_GITHUB_EVENT_COMMENT_ID: process.env.GH_AW_GITHUB_EVENT_COMMENT_ID,
+ GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER: process.env.GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER,
+ GH_AW_GITHUB_EVENT_ISSUE_NUMBER: process.env.GH_AW_GITHUB_EVENT_ISSUE_NUMBER,
+ GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER: process.env.GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER,
+ GH_AW_GITHUB_REPOSITORY: process.env.GH_AW_GITHUB_REPOSITORY,
+ GH_AW_GITHUB_RUN_ID: process.env.GH_AW_GITHUB_RUN_ID,
+ GH_AW_GITHUB_WORKSPACE: process.env.GH_AW_GITHUB_WORKSPACE
+ }
+ });
+ - name: Validate prompt placeholders
+ env:
+ GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt
+ # poutine:ignore untrusted_checkout_exec
+ run: bash ${RUNNER_TEMP}/gh-aw/actions/validate_prompt_placeholders.sh
+ - name: Print prompt
+ env:
+ GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt
+ # poutine:ignore untrusted_checkout_exec
+ run: bash ${RUNNER_TEMP}/gh-aw/actions/print_prompt_summary.sh
+ - name: Upload activation artifact
+ if: success()
+ uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7
+ with:
+ name: activation
+ path: |
+ /tmp/gh-aw/aw_info.json
+ /tmp/gh-aw/aw-prompts/prompt.txt
+ retention-days: 1
+
+ agent:
+ needs: activation
+ runs-on: ubuntu-latest
+ permissions:
+ actions: read
+ contents: read
+ discussions: read
+ issues: read
+ pull-requests: read
+ concurrency:
+ group: "gh-aw-copilot-${{ github.workflow }}"
+ env:
+ DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
+ GH_AW_ASSETS_ALLOWED_EXTS: ""
+ GH_AW_ASSETS_BRANCH: ""
+ GH_AW_ASSETS_MAX_SIZE_KB: 0
+ GH_AW_MCP_LOG_DIR: /tmp/gh-aw/mcp-logs/safeoutputs
+ GH_AW_WORKFLOW_ID_SANITIZED: agenticobservabilitykit
+ outputs:
+ checkout_pr_success: ${{ steps.checkout-pr.outputs.checkout_pr_success || 'true' }}
+ has_patch: ${{ steps.collect_output.outputs.has_patch }}
+ inference_access_error: ${{ steps.detect-inference-error.outputs.inference_access_error || 'false' }}
+ model: ${{ needs.activation.outputs.model }}
+ output: ${{ steps.collect_output.outputs.output }}
+ output_types: ${{ steps.collect_output.outputs.output_types }}
+ steps:
+ - name: Checkout actions folder
+ uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+ with:
+ repository: github/gh-aw
+ sparse-checkout: |
+ actions
+ persist-credentials: false
+ - name: Setup Scripts
+ uses: ./actions/setup
+ with:
+ destination: ${{ runner.temp }}/gh-aw/actions
+ - name: Set runtime paths
+ id: set-runtime-paths
+ run: |
+ echo "GH_AW_SAFE_OUTPUTS=${RUNNER_TEMP}/gh-aw/safeoutputs/outputs.jsonl" >> "$GITHUB_OUTPUT"
+ echo "GH_AW_SAFE_OUTPUTS_CONFIG_PATH=${RUNNER_TEMP}/gh-aw/safeoutputs/config.json" >> "$GITHUB_OUTPUT"
+ echo "GH_AW_SAFE_OUTPUTS_TOOLS_PATH=${RUNNER_TEMP}/gh-aw/safeoutputs/tools.json" >> "$GITHUB_OUTPUT"
+ - name: Checkout repository
+ uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+ with:
+ persist-credentials: false
+ - name: Setup Go for CLI build
+ uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
+ with:
+ go-version-file: go.mod
+ cache: true
+ - name: Build gh-aw CLI
+ run: |
+ echo "Building gh-aw CLI for linux/amd64..."
+ mkdir -p dist
+ VERSION=$(git describe --tags --always --dirty)
+ CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build \
+ -ldflags "-s -w -X main.version=${VERSION}" \
+ -o dist/gh-aw-linux-amd64 \
+ ./cmd/gh-aw
+ # Copy binary to root for direct execution in user-defined steps
+ cp dist/gh-aw-linux-amd64 ./gh-aw
+ chmod +x ./gh-aw
+ echo "✓ Built gh-aw CLI successfully"
+ - name: Setup Docker Buildx
+ uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4
+ - name: Build gh-aw Docker image
+ uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7
+ with:
+ context: .
+ platforms: linux/amd64
+ push: false
+ load: true
+ tags: localhost/gh-aw:dev
+ build-args: |
+ BINARY=dist/gh-aw-linux-amd64
+ - name: Create gh-aw temp directory
+ run: bash ${RUNNER_TEMP}/gh-aw/actions/create_gh_aw_tmp_dir.sh
+ - name: Configure gh CLI for GitHub Enterprise
+ run: bash ${RUNNER_TEMP}/gh-aw/actions/configure_gh_for_ghe.sh
+ env:
+ GH_TOKEN: ${{ github.token }}
+ - name: Configure Git credentials
+ env:
+ REPO_NAME: ${{ github.repository }}
+ SERVER_URL: ${{ github.server_url }}
+ run: |
+ git config --global user.email "github-actions[bot]@users.noreply.github.com"
+ git config --global user.name "github-actions[bot]"
+ git config --global am.keepcr true
+ # Re-authenticate git with GitHub token
+ SERVER_URL_STRIPPED="${SERVER_URL#https://}"
+ git remote set-url origin "https://x-access-token:${{ github.token }}@${SERVER_URL_STRIPPED}/${REPO_NAME}.git"
+ echo "Git configured with standard GitHub Actions identity"
+ - name: Checkout PR branch
+ id: checkout-pr
+ if: |
+ github.event.pull_request || github.event.issue.pull_request
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8
+ env:
+ GH_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN || secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}
+ with:
+ github-token: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN || secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/checkout_pr_branch.cjs');
+ await main();
+ - name: Install GitHub Copilot CLI
+ run: ${RUNNER_TEMP}/gh-aw/actions/install_copilot_cli.sh latest
+ env:
+ GH_HOST: github.com
+ - name: Install AWF binary
+ run: bash ${RUNNER_TEMP}/gh-aw/actions/install_awf_binary.sh v0.25.0
+ - name: Determine automatic lockdown mode for GitHub MCP Server
+ id: determine-automatic-lockdown
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8
+ env:
+ GH_AW_GITHUB_TOKEN: ${{ secrets.GH_AW_GITHUB_TOKEN }}
+ GH_AW_GITHUB_MCP_SERVER_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN }}
+ with:
+ script: |
+ const determineAutomaticLockdown = require('${{ runner.temp }}/gh-aw/actions/determine_automatic_lockdown.cjs');
+ await determineAutomaticLockdown(github, context, core);
+ - name: Download container images
+ run: bash ${RUNNER_TEMP}/gh-aw/actions/download_docker_images.sh ghcr.io/github/gh-aw-firewall/agent:0.25.0 ghcr.io/github/gh-aw-firewall/api-proxy:0.25.0 ghcr.io/github/gh-aw-firewall/squid:0.25.0 ghcr.io/github/gh-aw-mcpg:v0.2.6 ghcr.io/github/github-mcp-server:v0.32.0 node:lts-alpine
+ - name: Install gh-aw extension
+ env:
+ GH_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN || secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}
+ run: |
+ # Check if gh-aw extension is already installed
+ if gh extension list | grep -q "github/gh-aw"; then
+ echo "gh-aw extension already installed, upgrading..."
+ gh extension upgrade gh-aw || true
+ else
+ echo "Installing gh-aw extension..."
+ gh extension install github/gh-aw
+ fi
+ gh aw --version
+ # Copy the gh-aw binary to ${RUNNER_TEMP}/gh-aw for MCP server containerization
+ mkdir -p ${RUNNER_TEMP}/gh-aw
+ GH_AW_BIN=$(which gh-aw 2>/dev/null || find ~/.local/share/gh/extensions/gh-aw -name 'gh-aw' -type f 2>/dev/null | head -1)
+ if [ -n "$GH_AW_BIN" ] && [ -f "$GH_AW_BIN" ]; then
+ cp "$GH_AW_BIN" ${RUNNER_TEMP}/gh-aw/gh-aw
+ chmod +x ${RUNNER_TEMP}/gh-aw/gh-aw
+ echo "Copied gh-aw binary to ${RUNNER_TEMP}/gh-aw/gh-aw"
+ else
+ echo "::error::Failed to find gh-aw binary for MCP server"
+ exit 1
+ fi
+ - name: Write Safe Outputs Config
+ run: |
+ mkdir -p ${RUNNER_TEMP}/gh-aw/safeoutputs
+ mkdir -p /tmp/gh-aw/safeoutputs
+ mkdir -p /tmp/gh-aw/mcp-logs/safeoutputs
+ cat > ${RUNNER_TEMP}/gh-aw/safeoutputs/config.json << 'GH_AW_SAFE_OUTPUTS_CONFIG_EOF'
+ {"create_discussion":{"category":"audits","close_older_discussions":true,"expires":168,"fallback_to_issue":true,"max":1,"title_prefix":"[observability] "},"create_issue":{"close_older_issues":true,"labels":["agentics","warning","observability"],"max":1,"title_prefix":"[observability escalation] "},"mentions":{"enabled":false},"missing_data":{},"missing_tool":{},"noop":{"max":1,"report-as-issue":"false"}}
+ GH_AW_SAFE_OUTPUTS_CONFIG_EOF
+ - name: Write Safe Outputs Tools
+ run: |
+ cat > ${RUNNER_TEMP}/gh-aw/safeoutputs/tools_meta.json << 'GH_AW_SAFE_OUTPUTS_TOOLS_META_EOF'
+ {
+ "description_suffixes": {
+ "create_discussion": " CONSTRAINTS: Maximum 1 discussion(s) can be created. Title will be prefixed with \"[observability] \". Discussions will be created in category \"audits\".",
+ "create_issue": " CONSTRAINTS: Maximum 1 issue(s) can be created. Title will be prefixed with \"[observability escalation] \". Labels [\"agentics\" \"warning\" \"observability\"] will be automatically added."
+ },
+ "repo_params": {},
+ "dynamic_tools": []
+ }
+ GH_AW_SAFE_OUTPUTS_TOOLS_META_EOF
+ cat > ${RUNNER_TEMP}/gh-aw/safeoutputs/validation.json << 'GH_AW_SAFE_OUTPUTS_VALIDATION_EOF'
+ {
+ "create_discussion": {
+ "defaultMax": 1,
+ "fields": {
+ "body": {
+ "required": true,
+ "type": "string",
+ "sanitize": true,
+ "maxLength": 65000
+ },
+ "category": {
+ "type": "string",
+ "sanitize": true,
+ "maxLength": 128
+ },
+ "repo": {
+ "type": "string",
+ "maxLength": 256
+ },
+ "title": {
+ "required": true,
+ "type": "string",
+ "sanitize": true,
+ "maxLength": 128
+ }
+ }
+ },
+ "create_issue": {
+ "defaultMax": 1,
+ "fields": {
+ "body": {
+ "required": true,
+ "type": "string",
+ "sanitize": true,
+ "maxLength": 65000
+ },
+ "labels": {
+ "type": "array",
+ "itemType": "string",
+ "itemSanitize": true,
+ "itemMaxLength": 128
+ },
+ "parent": {
+ "issueOrPRNumber": true
+ },
+ "repo": {
+ "type": "string",
+ "maxLength": 256
+ },
+ "temporary_id": {
+ "type": "string"
+ },
+ "title": {
+ "required": true,
+ "type": "string",
+ "sanitize": true,
+ "maxLength": 128
+ }
+ }
+ },
+ "missing_data": {
+ "defaultMax": 20,
+ "fields": {
+ "alternatives": {
+ "type": "string",
+ "sanitize": true,
+ "maxLength": 256
+ },
+ "context": {
+ "type": "string",
+ "sanitize": true,
+ "maxLength": 256
+ },
+ "data_type": {
+ "type": "string",
+ "sanitize": true,
+ "maxLength": 128
+ },
+ "reason": {
+ "type": "string",
+ "sanitize": true,
+ "maxLength": 256
+ }
+ }
+ },
+ "missing_tool": {
+ "defaultMax": 20,
+ "fields": {
+ "alternatives": {
+ "type": "string",
+ "sanitize": true,
+ "maxLength": 512
+ },
+ "reason": {
+ "required": true,
+ "type": "string",
+ "sanitize": true,
+ "maxLength": 256
+ },
+ "tool": {
+ "type": "string",
+ "sanitize": true,
+ "maxLength": 128
+ }
+ }
+ },
+ "noop": {
+ "defaultMax": 1,
+ "fields": {
+ "message": {
+ "required": true,
+ "type": "string",
+ "sanitize": true,
+ "maxLength": 65000
+ }
+ }
+ }
+ }
+ GH_AW_SAFE_OUTPUTS_VALIDATION_EOF
+ node ${RUNNER_TEMP}/gh-aw/actions/generate_safe_outputs_tools.cjs
+ - name: Generate Safe Outputs MCP Server Config
+ id: safe-outputs-config
+ run: |
+ # Generate a secure random API key (360 bits of entropy, 40+ chars)
+ # Mask immediately to prevent timing vulnerabilities
+ API_KEY=$(openssl rand -base64 45 | tr -d '/+=')
+ echo "::add-mask::${API_KEY}"
+
+ PORT=3001
+
+ # Set outputs for next steps
+ {
+ echo "safe_outputs_api_key=${API_KEY}"
+ echo "safe_outputs_port=${PORT}"
+ } >> "$GITHUB_OUTPUT"
+
+ echo "Safe Outputs MCP server will run on port ${PORT}"
+
+ - name: Start Safe Outputs MCP HTTP Server
+ id: safe-outputs-start
+ env:
+ DEBUG: '*'
+ GH_AW_SAFE_OUTPUTS_PORT: ${{ steps.safe-outputs-config.outputs.safe_outputs_port }}
+ GH_AW_SAFE_OUTPUTS_API_KEY: ${{ steps.safe-outputs-config.outputs.safe_outputs_api_key }}
+ GH_AW_SAFE_OUTPUTS_TOOLS_PATH: ${{ runner.temp }}/gh-aw/safeoutputs/tools.json
+ GH_AW_SAFE_OUTPUTS_CONFIG_PATH: ${{ runner.temp }}/gh-aw/safeoutputs/config.json
+ GH_AW_MCP_LOG_DIR: /tmp/gh-aw/mcp-logs/safeoutputs
+ run: |
+ # Environment variables are set above to prevent template injection
+ export DEBUG
+ export GH_AW_SAFE_OUTPUTS_PORT
+ export GH_AW_SAFE_OUTPUTS_API_KEY
+ export GH_AW_SAFE_OUTPUTS_TOOLS_PATH
+ export GH_AW_SAFE_OUTPUTS_CONFIG_PATH
+ export GH_AW_MCP_LOG_DIR
+
+ bash ${RUNNER_TEMP}/gh-aw/actions/start_safe_outputs_server.sh
+
+ - name: Start MCP Gateway
+ id: start-mcp-gateway
+ env:
+ GH_AW_SAFE_OUTPUTS: ${{ steps.set-runtime-paths.outputs.GH_AW_SAFE_OUTPUTS }}
+ GH_AW_SAFE_OUTPUTS_API_KEY: ${{ steps.safe-outputs-start.outputs.api_key }}
+ GH_AW_SAFE_OUTPUTS_PORT: ${{ steps.safe-outputs-start.outputs.port }}
+ GITHUB_MCP_GUARD_MIN_INTEGRITY: ${{ steps.determine-automatic-lockdown.outputs.min_integrity }}
+ GITHUB_MCP_GUARD_REPOS: ${{ steps.determine-automatic-lockdown.outputs.repos }}
+ GITHUB_MCP_SERVER_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN || secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ run: |
+ set -eo pipefail
+ mkdir -p /tmp/gh-aw/mcp-config
+
+ # Export gateway environment variables for MCP config and gateway script
+ export MCP_GATEWAY_PORT="80"
+ export MCP_GATEWAY_DOMAIN="host.docker.internal"
+ MCP_GATEWAY_API_KEY=$(openssl rand -base64 45 | tr -d '/+=')
+ echo "::add-mask::${MCP_GATEWAY_API_KEY}"
+ export MCP_GATEWAY_API_KEY
+ export MCP_GATEWAY_PAYLOAD_DIR="/tmp/gh-aw/mcp-payloads"
+ mkdir -p "${MCP_GATEWAY_PAYLOAD_DIR}"
+ export MCP_GATEWAY_PAYLOAD_SIZE_THRESHOLD="524288"
+ export DEBUG="*"
+
+ export GH_AW_ENGINE="copilot"
+ export MCP_GATEWAY_DOCKER_COMMAND='docker run -i --rm --network host -v /var/run/docker.sock:/var/run/docker.sock -e MCP_GATEWAY_PORT -e MCP_GATEWAY_DOMAIN -e MCP_GATEWAY_API_KEY -e MCP_GATEWAY_PAYLOAD_DIR -e MCP_GATEWAY_PAYLOAD_SIZE_THRESHOLD -e DEBUG -e MCP_GATEWAY_LOG_DIR -e GH_AW_MCP_LOG_DIR -e GH_AW_SAFE_OUTPUTS -e GH_AW_SAFE_OUTPUTS_CONFIG_PATH -e GH_AW_SAFE_OUTPUTS_TOOLS_PATH -e GH_AW_ASSETS_BRANCH -e GH_AW_ASSETS_MAX_SIZE_KB -e GH_AW_ASSETS_ALLOWED_EXTS -e DEFAULT_BRANCH -e GITHUB_MCP_SERVER_TOKEN -e GITHUB_MCP_GUARD_MIN_INTEGRITY -e GITHUB_MCP_GUARD_REPOS -e GITHUB_REPOSITORY -e GITHUB_SERVER_URL -e GITHUB_SHA -e GITHUB_WORKSPACE -e GITHUB_TOKEN -e GITHUB_RUN_ID -e GITHUB_RUN_NUMBER -e GITHUB_RUN_ATTEMPT -e GITHUB_JOB -e GITHUB_ACTION -e GITHUB_EVENT_NAME -e GITHUB_EVENT_PATH -e GITHUB_ACTOR -e GITHUB_ACTOR_ID -e GITHUB_TRIGGERING_ACTOR -e GITHUB_WORKFLOW -e GITHUB_WORKFLOW_REF -e GITHUB_WORKFLOW_SHA -e GITHUB_REF -e GITHUB_REF_NAME -e GITHUB_REF_TYPE -e GITHUB_HEAD_REF -e GITHUB_BASE_REF -e GH_AW_SAFE_OUTPUTS_PORT -e GH_AW_SAFE_OUTPUTS_API_KEY -v /tmp/gh-aw/mcp-payloads:/tmp/gh-aw/mcp-payloads:rw -v /opt:/opt:ro -v /tmp:/tmp:rw -v '"${GITHUB_WORKSPACE}"':'"${GITHUB_WORKSPACE}"':rw ghcr.io/github/gh-aw-mcpg:v0.2.6'
+
+ mkdir -p /home/runner/.copilot
+ cat << GH_AW_MCP_CONFIG_EOF | bash ${RUNNER_TEMP}/gh-aw/actions/start_mcp_gateway.sh
+ {
+ "mcpServers": {
+ "agenticworkflows": {
+ "type": "stdio",
+ "container": "localhost/gh-aw:dev",
+ "mounts": ["\${GITHUB_WORKSPACE}:\${GITHUB_WORKSPACE}:rw", "/tmp/gh-aw:/tmp/gh-aw:rw"],
+ "args": ["--network", "host", "-w", "\${GITHUB_WORKSPACE}"],
+ "env": {
+ "DEBUG": "*",
+ "GITHUB_TOKEN": "\${GITHUB_TOKEN}",
+ "GITHUB_ACTOR": "\${GITHUB_ACTOR}",
+ "GITHUB_REPOSITORY": "\${GITHUB_REPOSITORY}"
+ },
+ "guard-policies": {
+ "write-sink": {
+ "accept": [
+ "*"
+ ]
+ }
+ }
+ },
+ "github": {
+ "type": "stdio",
+ "container": "ghcr.io/github/github-mcp-server:v0.32.0",
+ "env": {
+ "GITHUB_HOST": "\${GITHUB_SERVER_URL}",
+ "GITHUB_PERSONAL_ACCESS_TOKEN": "\${GITHUB_MCP_SERVER_TOKEN}",
+ "GITHUB_READ_ONLY": "1",
+ "GITHUB_TOOLSETS": "context,repos,issues,pull_requests,discussions"
+ },
+ "guard-policies": {
+ "allow-only": {
+ "min-integrity": "$GITHUB_MCP_GUARD_MIN_INTEGRITY",
+ "repos": "$GITHUB_MCP_GUARD_REPOS"
+ }
+ }
+ },
+ "safeoutputs": {
+ "type": "http",
+ "url": "http://host.docker.internal:$GH_AW_SAFE_OUTPUTS_PORT",
+ "headers": {
+ "Authorization": "\${GH_AW_SAFE_OUTPUTS_API_KEY}"
+ },
+ "guard-policies": {
+ "write-sink": {
+ "accept": [
+ "*"
+ ]
+ }
+ }
+ }
+ },
+ "gateway": {
+ "port": $MCP_GATEWAY_PORT,
+ "domain": "${MCP_GATEWAY_DOMAIN}",
+ "apiKey": "${MCP_GATEWAY_API_KEY}",
+ "payloadDir": "${MCP_GATEWAY_PAYLOAD_DIR}"
+ }
+ }
+ GH_AW_MCP_CONFIG_EOF
+ - name: Download activation artifact
+ uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
+ with:
+ name: activation
+ path: /tmp/gh-aw
+ - name: Clean git credentials
+ continue-on-error: true
+ run: bash ${RUNNER_TEMP}/gh-aw/actions/clean_git_credentials.sh
+ - name: Execute GitHub Copilot CLI
+ id: agentic_execution
+ # Copilot CLI tool arguments (sorted):
+ timeout-minutes: 30
+ run: |
+ set -o pipefail
+ touch /tmp/gh-aw/agent-step-summary.md
+ # shellcheck disable=SC1003
+ sudo -E awf --env-all --container-workdir "${GITHUB_WORKSPACE}" --mount "${RUNNER_TEMP}/gh-aw:${RUNNER_TEMP}/gh-aw:ro" --mount "${RUNNER_TEMP}/gh-aw:/host${RUNNER_TEMP}/gh-aw:ro" --allow-domains "api.business.githubcopilot.com,api.enterprise.githubcopilot.com,api.github.com,api.githubcopilot.com,api.individual.githubcopilot.com,api.snapcraft.io,archive.ubuntu.com,azure.archive.ubuntu.com,crl.geotrust.com,crl.globalsign.com,crl.identrust.com,crl.sectigo.com,crl.thawte.com,crl.usertrust.com,crl.verisign.com,crl3.digicert.com,crl4.digicert.com,crls.ssl.com,github.com,host.docker.internal,json-schema.org,json.schemastore.org,keyserver.ubuntu.com,ocsp.digicert.com,ocsp.geotrust.com,ocsp.globalsign.com,ocsp.identrust.com,ocsp.sectigo.com,ocsp.ssl.com,ocsp.thawte.com,ocsp.usertrust.com,ocsp.verisign.com,packagecloud.io,packages.cloud.google.com,packages.microsoft.com,ppa.launchpad.net,raw.githubusercontent.com,registry.npmjs.org,s.symcb.com,s.symcd.com,security.ubuntu.com,telemetry.enterprise.githubcopilot.com,ts-crl.ws.symantec.com,ts-ocsp.ws.symantec.com,www.googleapis.com" --log-level info --proxy-logs-dir /tmp/gh-aw/sandbox/firewall/logs --audit-dir /tmp/gh-aw/sandbox/firewall/audit --enable-host-access --image-tag 0.25.0 --skip-pull --enable-api-proxy \
+ -- /bin/bash -c '/usr/local/bin/copilot --add-dir /tmp/gh-aw/ --log-level all --log-dir /tmp/gh-aw/sandbox/agent/logs/ --add-dir "${GITHUB_WORKSPACE}" --disable-builtin-mcps --allow-all-tools --allow-all-paths --prompt "$(cat /tmp/gh-aw/aw-prompts/prompt.txt)"' 2>&1 | tee -a /tmp/gh-aw/agent-stdio.log
+ env:
+ COPILOT_AGENT_RUNNER_TYPE: STANDALONE
+ COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }}
+ COPILOT_MODEL: ${{ vars.GH_AW_MODEL_AGENT_COPILOT || '' }}
+ GH_AW_MCP_CONFIG: /home/runner/.copilot/mcp-config.json
+ GH_AW_PHASE: agent
+ GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt
+ GH_AW_SAFE_OUTPUTS: ${{ steps.set-runtime-paths.outputs.GH_AW_SAFE_OUTPUTS }}
+ GH_AW_VERSION: dev
+ GITHUB_API_URL: ${{ github.api_url }}
+ GITHUB_AW: true
+ GITHUB_HEAD_REF: ${{ github.head_ref }}
+ GITHUB_MCP_SERVER_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN || secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}
+ GITHUB_REF_NAME: ${{ github.ref_name }}
+ GITHUB_SERVER_URL: ${{ github.server_url }}
+ GITHUB_STEP_SUMMARY: /tmp/gh-aw/agent-step-summary.md
+ GITHUB_WORKSPACE: ${{ github.workspace }}
+ GIT_AUTHOR_EMAIL: github-actions[bot]@users.noreply.github.com
+ GIT_AUTHOR_NAME: github-actions[bot]
+ GIT_COMMITTER_EMAIL: github-actions[bot]@users.noreply.github.com
+ GIT_COMMITTER_NAME: github-actions[bot]
+ XDG_CONFIG_HOME: /home/runner
+ - name: Detect inference access error
+ id: detect-inference-error
+ if: always()
+ continue-on-error: true
+ run: bash ${RUNNER_TEMP}/gh-aw/actions/detect_inference_access_error.sh
+ - name: Configure Git credentials
+ env:
+ REPO_NAME: ${{ github.repository }}
+ SERVER_URL: ${{ github.server_url }}
+ run: |
+ git config --global user.email "github-actions[bot]@users.noreply.github.com"
+ git config --global user.name "github-actions[bot]"
+ git config --global am.keepcr true
+ # Re-authenticate git with GitHub token
+ SERVER_URL_STRIPPED="${SERVER_URL#https://}"
+ git remote set-url origin "https://x-access-token:${{ github.token }}@${SERVER_URL_STRIPPED}/${REPO_NAME}.git"
+ echo "Git configured with standard GitHub Actions identity"
+ - name: Copy Copilot session state files to logs
+ if: always()
+ continue-on-error: true
+ run: |
+ # Copy Copilot session state files to logs folder for artifact collection
+ # This ensures they are in /tmp/gh-aw/ where secret redaction can scan them
+ SESSION_STATE_DIR="$HOME/.copilot/session-state"
+ LOGS_DIR="/tmp/gh-aw/sandbox/agent/logs"
+
+ if [ -d "$SESSION_STATE_DIR" ]; then
+ echo "Copying Copilot session state files from $SESSION_STATE_DIR to $LOGS_DIR"
+ mkdir -p "$LOGS_DIR"
+ cp -v "$SESSION_STATE_DIR"/*.jsonl "$LOGS_DIR/" 2>/dev/null || true
+ echo "Session state files copied successfully"
+ else
+ echo "No session-state directory found at $SESSION_STATE_DIR"
+ fi
+ - name: Stop MCP Gateway
+ if: always()
+ continue-on-error: true
+ env:
+ MCP_GATEWAY_PORT: ${{ steps.start-mcp-gateway.outputs.gateway-port }}
+ MCP_GATEWAY_API_KEY: ${{ steps.start-mcp-gateway.outputs.gateway-api-key }}
+ GATEWAY_PID: ${{ steps.start-mcp-gateway.outputs.gateway-pid }}
+ run: |
+ bash ${RUNNER_TEMP}/gh-aw/actions/stop_mcp_gateway.sh "$GATEWAY_PID"
+ - name: Redact secrets in logs
+ if: always()
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8
+ with:
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/redact_secrets.cjs');
+ await main();
+ env:
+ GH_AW_SECRET_NAMES: 'COPILOT_GITHUB_TOKEN,GH_AW_GITHUB_MCP_SERVER_TOKEN,GH_AW_GITHUB_TOKEN,GITHUB_TOKEN'
+ SECRET_COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }}
+ SECRET_GH_AW_GITHUB_MCP_SERVER_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN }}
+ SECRET_GH_AW_GITHUB_TOKEN: ${{ secrets.GH_AW_GITHUB_TOKEN }}
+ SECRET_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ - name: Append agent step summary
+ if: always()
+ run: bash ${RUNNER_TEMP}/gh-aw/actions/append_agent_step_summary.sh
+ - name: Copy Safe Outputs
+ if: always()
+ env:
+ GH_AW_SAFE_OUTPUTS: ${{ steps.set-runtime-paths.outputs.GH_AW_SAFE_OUTPUTS }}
+ run: |
+ mkdir -p /tmp/gh-aw
+ cp "$GH_AW_SAFE_OUTPUTS" /tmp/gh-aw/safeoutputs.jsonl 2>/dev/null || true
+ - name: Ingest agent output
+ id: collect_output
+ if: always()
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8
+ env:
+ GH_AW_SAFE_OUTPUTS: ${{ steps.set-runtime-paths.outputs.GH_AW_SAFE_OUTPUTS }}
+ GH_AW_ALLOWED_DOMAINS: "api.business.githubcopilot.com,api.enterprise.githubcopilot.com,api.github.com,api.githubcopilot.com,api.individual.githubcopilot.com,api.snapcraft.io,archive.ubuntu.com,azure.archive.ubuntu.com,crl.geotrust.com,crl.globalsign.com,crl.identrust.com,crl.sectigo.com,crl.thawte.com,crl.usertrust.com,crl.verisign.com,crl3.digicert.com,crl4.digicert.com,crls.ssl.com,github.com,host.docker.internal,json-schema.org,json.schemastore.org,keyserver.ubuntu.com,ocsp.digicert.com,ocsp.geotrust.com,ocsp.globalsign.com,ocsp.identrust.com,ocsp.sectigo.com,ocsp.ssl.com,ocsp.thawte.com,ocsp.usertrust.com,ocsp.verisign.com,packagecloud.io,packages.cloud.google.com,packages.microsoft.com,ppa.launchpad.net,raw.githubusercontent.com,registry.npmjs.org,s.symcb.com,s.symcd.com,security.ubuntu.com,telemetry.enterprise.githubcopilot.com,ts-crl.ws.symantec.com,ts-ocsp.ws.symantec.com,www.googleapis.com"
+ GH_AW_ALLOWED_GITHUB_REFS: ""
+ GITHUB_SERVER_URL: ${{ github.server_url }}
+ GITHUB_API_URL: ${{ github.api_url }}
+ with:
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/collect_ndjson_output.cjs');
+ await main();
+ - name: Parse agent logs for step summary
+ if: always()
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8
+ env:
+ GH_AW_AGENT_OUTPUT: /tmp/gh-aw/sandbox/agent/logs/
+ with:
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/parse_copilot_log.cjs');
+ await main();
+ - name: Parse MCP Gateway logs for step summary
+ if: always()
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8
+ with:
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/parse_mcp_gateway_log.cjs');
+ await main();
+ - name: Print firewall logs
+ if: always()
+ continue-on-error: true
+ env:
+ AWF_LOGS_DIR: /tmp/gh-aw/sandbox/firewall/logs
+ run: |
+ # Fix permissions on firewall logs so they can be uploaded as artifacts
+ # AWF runs with sudo, creating files owned by root
+ sudo chmod -R a+r /tmp/gh-aw/sandbox/firewall/logs 2>/dev/null || true
+ # Only run awf logs summary if awf command exists (it may not be installed if workflow failed before install step)
+ if command -v awf &> /dev/null; then
+ awf logs summary | tee -a "$GITHUB_STEP_SUMMARY"
+ else
+ echo 'AWF binary not installed, skipping firewall log summary'
+ fi
+ - name: Write agent output placeholder if missing
+ if: always()
+ run: |
+ if [ ! -f /tmp/gh-aw/agent_output.json ]; then
+ echo '{"items":[]}' > /tmp/gh-aw/agent_output.json
+ fi
+ - name: Upload agent artifacts
+ if: always()
+ continue-on-error: true
+ uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7
+ with:
+ name: agent
+ path: |
+ /tmp/gh-aw/aw-prompts/prompt.txt
+ /tmp/gh-aw/sandbox/agent/logs/
+ /tmp/gh-aw/redacted-urls.log
+ /tmp/gh-aw/mcp-logs/
+ /tmp/gh-aw/agent-stdio.log
+ /tmp/gh-aw/agent/
+ /tmp/gh-aw/safeoutputs.jsonl
+ /tmp/gh-aw/agent_output.json
+ if-no-files-found: ignore
+ - name: Upload firewall audit logs
+ if: always()
+ continue-on-error: true
+ uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7
+ with:
+ name: firewall-audit-logs
+ path: |
+ /tmp/gh-aw/sandbox/firewall/logs/
+ /tmp/gh-aw/sandbox/firewall/audit/
+ if-no-files-found: ignore
+
+ conclusion:
+ needs:
+ - activation
+ - agent
+ - detection
+ - safe_outputs
+ if: always() && (needs.agent.result != 'skipped' || needs.activation.outputs.lockdown_check_failed == 'true')
+ runs-on: ubuntu-slim
+ permissions:
+ contents: read
+ discussions: write
+ issues: write
+ concurrency:
+ group: "gh-aw-conclusion-agentic-observability-kit"
+ cancel-in-progress: false
+ outputs:
+ noop_message: ${{ steps.noop.outputs.noop_message }}
+ tools_reported: ${{ steps.missing_tool.outputs.tools_reported }}
+ total_count: ${{ steps.missing_tool.outputs.total_count }}
+ steps:
+ - name: Checkout actions folder
+ uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+ with:
+ repository: github/gh-aw
+ sparse-checkout: |
+ actions
+ persist-credentials: false
+ - name: Setup Scripts
+ uses: ./actions/setup
+ with:
+ destination: ${{ runner.temp }}/gh-aw/actions
+ - name: Download agent output artifact
+ id: download-agent-output
+ continue-on-error: true
+ uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
+ with:
+ name: agent
+ path: /tmp/gh-aw/
+ - name: Setup agent output environment variable
+ id: setup-agent-output-env
+ if: steps.download-agent-output.outcome == 'success'
+ run: |
+ mkdir -p /tmp/gh-aw/
+ find "/tmp/gh-aw/" -type f -print
+ echo "GH_AW_AGENT_OUTPUT=/tmp/gh-aw/agent_output.json" >> "$GITHUB_OUTPUT"
+ - name: Process No-Op Messages
+ id: noop
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8
+ env:
+ GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }}
+ GH_AW_NOOP_MAX: "1"
+ GH_AW_WORKFLOW_NAME: "Agentic Observability Kit"
+ GH_AW_TRACKER_ID: "agentic-observability-kit"
+ with:
+ github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/noop.cjs');
+ await main();
+ - name: Record Missing Tool
+ id: missing_tool
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8
+ env:
+ GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }}
+ GH_AW_WORKFLOW_NAME: "Agentic Observability Kit"
+ GH_AW_TRACKER_ID: "agentic-observability-kit"
+ with:
+ github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/missing_tool.cjs');
+ await main();
+ - name: Handle Agent Failure
+ id: handle_agent_failure
+ if: always()
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8
+ env:
+ GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }}
+ GH_AW_WORKFLOW_NAME: "Agentic Observability Kit"
+ GH_AW_TRACKER_ID: "agentic-observability-kit"
+ GH_AW_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+ GH_AW_AGENT_CONCLUSION: ${{ needs.agent.result }}
+ GH_AW_WORKFLOW_ID: "agentic-observability-kit"
+ GH_AW_SECRET_VERIFICATION_RESULT: ${{ needs.activation.outputs.secret_verification_result }}
+ GH_AW_CHECKOUT_PR_SUCCESS: ${{ needs.agent.outputs.checkout_pr_success }}
+ GH_AW_INFERENCE_ACCESS_ERROR: ${{ needs.agent.outputs.inference_access_error }}
+ GH_AW_CREATE_DISCUSSION_ERRORS: ${{ needs.safe_outputs.outputs.create_discussion_errors }}
+ GH_AW_CREATE_DISCUSSION_ERROR_COUNT: ${{ needs.safe_outputs.outputs.create_discussion_error_count }}
+ GH_AW_LOCKDOWN_CHECK_FAILED: ${{ needs.activation.outputs.lockdown_check_failed }}
+ GH_AW_GROUP_REPORTS: "false"
+ GH_AW_FAILURE_REPORT_AS_ISSUE: "true"
+ GH_AW_TIMEOUT_MINUTES: "30"
+ with:
+ github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/handle_agent_failure.cjs');
+ await main();
+ - name: Handle No-Op Message
+ id: handle_noop_message
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8
+ env:
+ GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }}
+ GH_AW_WORKFLOW_NAME: "Agentic Observability Kit"
+ GH_AW_TRACKER_ID: "agentic-observability-kit"
+ GH_AW_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+ GH_AW_AGENT_CONCLUSION: ${{ needs.agent.result }}
+ GH_AW_NOOP_MESSAGE: ${{ steps.noop.outputs.noop_message }}
+ GH_AW_NOOP_REPORT_AS_ISSUE: "false"
+ with:
+ github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/handle_noop_message.cjs');
+ await main();
+
+ detection:
+ needs: agent
+ if: always() && needs.agent.result != 'skipped'
+ runs-on: ubuntu-latest
+ permissions:
+ contents: read
+ outputs:
+ detection_conclusion: ${{ steps.detection_conclusion.outputs.conclusion }}
+ detection_success: ${{ steps.detection_conclusion.outputs.success }}
+ steps:
+ - name: Checkout actions folder
+ uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+ with:
+ repository: github/gh-aw
+ sparse-checkout: |
+ actions
+ persist-credentials: false
+ - name: Setup Scripts
+ uses: ./actions/setup
+ with:
+ destination: ${{ runner.temp }}/gh-aw/actions
+ - name: Download agent output artifact
+ id: download-agent-output
+ continue-on-error: true
+ uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
+ with:
+ name: agent
+ path: /tmp/gh-aw/
+ - name: Setup agent output environment variable
+ id: setup-agent-output-env
+ if: steps.download-agent-output.outcome == 'success'
+ run: |
+ mkdir -p /tmp/gh-aw/
+ find "/tmp/gh-aw/" -type f -print
+ echo "GH_AW_AGENT_OUTPUT=/tmp/gh-aw/agent_output.json" >> "$GITHUB_OUTPUT"
+ # --- Threat Detection ---
+ - name: Download container images
+ run: bash ${RUNNER_TEMP}/gh-aw/actions/download_docker_images.sh ghcr.io/github/gh-aw-firewall/agent:0.25.0 ghcr.io/github/gh-aw-firewall/api-proxy:0.25.0 ghcr.io/github/gh-aw-firewall/squid:0.25.0
+ - name: Check if detection needed
+ id: detection_guard
+ if: always()
+ env:
+ OUTPUT_TYPES: ${{ needs.agent.outputs.output_types }}
+ HAS_PATCH: ${{ needs.agent.outputs.has_patch }}
+ run: |
+ if [[ -n "$OUTPUT_TYPES" || "$HAS_PATCH" == "true" ]]; then
+ echo "run_detection=true" >> "$GITHUB_OUTPUT"
+ echo "Detection will run: output_types=$OUTPUT_TYPES, has_patch=$HAS_PATCH"
+ else
+ echo "run_detection=false" >> "$GITHUB_OUTPUT"
+ echo "Detection skipped: no agent outputs or patches to analyze"
+ fi
+ - name: Clear MCP configuration for detection
+ if: always() && steps.detection_guard.outputs.run_detection == 'true'
+ run: |
+ rm -f /tmp/gh-aw/mcp-config/mcp-servers.json
+ rm -f /home/runner/.copilot/mcp-config.json
+ rm -f "$GITHUB_WORKSPACE/.gemini/settings.json"
+ - name: Prepare threat detection files
+ if: always() && steps.detection_guard.outputs.run_detection == 'true'
+ run: |
+ mkdir -p /tmp/gh-aw/threat-detection/aw-prompts
+ cp /tmp/gh-aw/aw-prompts/prompt.txt /tmp/gh-aw/threat-detection/aw-prompts/prompt.txt 2>/dev/null || true
+ cp /tmp/gh-aw/agent_output.json /tmp/gh-aw/threat-detection/agent_output.json 2>/dev/null || true
+ for f in /tmp/gh-aw/aw-*.patch; do
+ [ -f "$f" ] && cp "$f" /tmp/gh-aw/threat-detection/ 2>/dev/null || true
+ done
+ echo "Prepared threat detection files:"
+ ls -la /tmp/gh-aw/threat-detection/ 2>/dev/null || true
+ - name: Setup threat detection
+ if: always() && steps.detection_guard.outputs.run_detection == 'true'
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8
+ env:
+ WORKFLOW_NAME: "Agentic Observability Kit"
+ WORKFLOW_DESCRIPTION: "Drop-in observability kit for repositories using agentic workflows"
+ HAS_PATCH: ${{ needs.agent.outputs.has_patch }}
+ with:
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/setup_threat_detection.cjs');
+ await main();
+ - name: Ensure threat-detection directory and log
+ if: always() && steps.detection_guard.outputs.run_detection == 'true'
+ run: |
+ mkdir -p /tmp/gh-aw/threat-detection
+ touch /tmp/gh-aw/threat-detection/detection.log
+ - name: Install GitHub Copilot CLI
+ run: ${RUNNER_TEMP}/gh-aw/actions/install_copilot_cli.sh latest
+ env:
+ GH_HOST: github.com
+ - name: Install AWF binary
+ run: bash ${RUNNER_TEMP}/gh-aw/actions/install_awf_binary.sh v0.25.0
+ - name: Execute GitHub Copilot CLI
+ if: always() && steps.detection_guard.outputs.run_detection == 'true'
+ id: detection_agentic_execution
+ # Copilot CLI tool arguments (sorted):
+ # --allow-tool shell(cat)
+ # --allow-tool shell(grep)
+ # --allow-tool shell(head)
+ # --allow-tool shell(jq)
+ # --allow-tool shell(ls)
+ # --allow-tool shell(tail)
+ # --allow-tool shell(wc)
+ timeout-minutes: 20
+ run: |
+ set -o pipefail
+ touch /tmp/gh-aw/agent-step-summary.md
+ # shellcheck disable=SC1003
+ sudo -E awf --env-all --container-workdir "${GITHUB_WORKSPACE}" --mount "${RUNNER_TEMP}/gh-aw:${RUNNER_TEMP}/gh-aw:ro" --mount "${RUNNER_TEMP}/gh-aw:/host${RUNNER_TEMP}/gh-aw:ro" --allow-domains "api.business.githubcopilot.com,api.enterprise.githubcopilot.com,api.github.com,api.githubcopilot.com,api.individual.githubcopilot.com,github.com,host.docker.internal,raw.githubusercontent.com,registry.npmjs.org,telemetry.enterprise.githubcopilot.com" --log-level info --proxy-logs-dir /tmp/gh-aw/sandbox/firewall/logs --audit-dir /tmp/gh-aw/sandbox/firewall/audit --enable-host-access --image-tag 0.25.0 --skip-pull --enable-api-proxy \
+ -- /bin/bash -c '/usr/local/bin/copilot --add-dir /tmp/gh-aw/ --log-level all --log-dir /tmp/gh-aw/sandbox/agent/logs/ --add-dir "${GITHUB_WORKSPACE}" --disable-builtin-mcps --allow-tool '\''shell(cat)'\'' --allow-tool '\''shell(grep)'\'' --allow-tool '\''shell(head)'\'' --allow-tool '\''shell(jq)'\'' --allow-tool '\''shell(ls)'\'' --allow-tool '\''shell(tail)'\'' --allow-tool '\''shell(wc)'\'' --prompt "$(cat /tmp/gh-aw/aw-prompts/prompt.txt)"' 2>&1 | tee -a /tmp/gh-aw/threat-detection/detection.log
+ env:
+ COPILOT_AGENT_RUNNER_TYPE: STANDALONE
+ COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }}
+ COPILOT_MODEL: ${{ vars.GH_AW_MODEL_DETECTION_COPILOT || '' }}
+ GH_AW_PHASE: detection
+ GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt
+ GH_AW_VERSION: dev
+ GITHUB_API_URL: ${{ github.api_url }}
+ GITHUB_AW: true
+ GITHUB_HEAD_REF: ${{ github.head_ref }}
+ GITHUB_REF_NAME: ${{ github.ref_name }}
+ GITHUB_SERVER_URL: ${{ github.server_url }}
+ GITHUB_STEP_SUMMARY: /tmp/gh-aw/agent-step-summary.md
+ GITHUB_WORKSPACE: ${{ github.workspace }}
+ GIT_AUTHOR_EMAIL: github-actions[bot]@users.noreply.github.com
+ GIT_AUTHOR_NAME: github-actions[bot]
+ GIT_COMMITTER_EMAIL: github-actions[bot]@users.noreply.github.com
+ GIT_COMMITTER_NAME: github-actions[bot]
+ XDG_CONFIG_HOME: /home/runner
+ - name: Parse threat detection results
+ id: parse_detection_results
+ if: always() && steps.detection_guard.outputs.run_detection == 'true'
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8
+ with:
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/parse_threat_detection_results.cjs');
+ await main();
+ - name: Upload threat detection log
+ if: always() && steps.detection_guard.outputs.run_detection == 'true'
+ uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7
+ with:
+ name: detection
+ path: /tmp/gh-aw/threat-detection/detection.log
+ if-no-files-found: ignore
+ - name: Set detection conclusion
+ id: detection_conclusion
+ if: always()
+ env:
+ RUN_DETECTION: ${{ steps.detection_guard.outputs.run_detection }}
+ DETECTION_SUCCESS: ${{ steps.parse_detection_results.outputs.success }}
+ run: |
+ if [[ "$RUN_DETECTION" != "true" ]]; then
+ echo "conclusion=skipped" >> "$GITHUB_OUTPUT"
+ echo "success=true" >> "$GITHUB_OUTPUT"
+ echo "Detection was not needed, marking as skipped"
+ elif [[ "$DETECTION_SUCCESS" == "true" ]]; then
+ echo "conclusion=success" >> "$GITHUB_OUTPUT"
+ echo "success=true" >> "$GITHUB_OUTPUT"
+ echo "Detection passed successfully"
+ else
+ echo "conclusion=failure" >> "$GITHUB_OUTPUT"
+ echo "success=false" >> "$GITHUB_OUTPUT"
+ echo "Detection found issues"
+ exit 1
+ fi
+
+ safe_outputs:
+ needs:
+ - agent
+ - detection
+ if: (!cancelled()) && needs.agent.result != 'skipped' && needs.detection.result == 'success'
+ runs-on: ubuntu-slim
+ permissions:
+ contents: read
+ discussions: write
+ issues: write
+ concurrency:
+ group: "agentic-observability-kit-safe-outputs"
+ cancel-in-progress: false
+ timeout-minutes: 15
+ env:
+ GH_AW_CALLER_WORKFLOW_ID: "${{ github.repository }}/agentic-observability-kit"
+ GH_AW_ENGINE_ID: "copilot"
+ GH_AW_ENGINE_MODEL: ${{ needs.agent.outputs.model }}
+ GH_AW_TRACKER_ID: "agentic-observability-kit"
+ GH_AW_WORKFLOW_ID: "agentic-observability-kit"
+ GH_AW_WORKFLOW_NAME: "Agentic Observability Kit"
+ outputs:
+ code_push_failure_count: ${{ steps.process_safe_outputs.outputs.code_push_failure_count }}
+ code_push_failure_errors: ${{ steps.process_safe_outputs.outputs.code_push_failure_errors }}
+ create_discussion_error_count: ${{ steps.process_safe_outputs.outputs.create_discussion_error_count }}
+ create_discussion_errors: ${{ steps.process_safe_outputs.outputs.create_discussion_errors }}
+ created_issue_number: ${{ steps.process_safe_outputs.outputs.created_issue_number }}
+ created_issue_url: ${{ steps.process_safe_outputs.outputs.created_issue_url }}
+ process_safe_outputs_processed_count: ${{ steps.process_safe_outputs.outputs.processed_count }}
+ process_safe_outputs_temporary_id_map: ${{ steps.process_safe_outputs.outputs.temporary_id_map }}
+ steps:
+ - name: Checkout actions folder
+ uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+ with:
+ repository: github/gh-aw
+ sparse-checkout: |
+ actions
+ persist-credentials: false
+ - name: Setup Scripts
+ uses: ./actions/setup
+ with:
+ destination: ${{ runner.temp }}/gh-aw/actions
+ - name: Download agent output artifact
+ id: download-agent-output
+ continue-on-error: true
+ uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
+ with:
+ name: agent
+ path: /tmp/gh-aw/
+ - name: Setup agent output environment variable
+ id: setup-agent-output-env
+ if: steps.download-agent-output.outcome == 'success'
+ run: |
+ mkdir -p /tmp/gh-aw/
+ find "/tmp/gh-aw/" -type f -print
+ echo "GH_AW_AGENT_OUTPUT=/tmp/gh-aw/agent_output.json" >> "$GITHUB_OUTPUT"
+ - name: Configure GH_HOST for enterprise compatibility
+ id: ghes-host-config
+ shell: bash
+ run: |
+ # Derive GH_HOST from GITHUB_SERVER_URL so the gh CLI targets the correct
+ # GitHub instance (GHES/GHEC). On github.com this is a harmless no-op.
+ GH_HOST="${GITHUB_SERVER_URL#https://}"
+ GH_HOST="${GH_HOST#http://}"
+ echo "GH_HOST=${GH_HOST}" >> "$GITHUB_OUTPUT"
+ - name: Process Safe Outputs
+ id: process_safe_outputs
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8
+ env:
+ GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }}
+ GH_AW_ALLOWED_DOMAINS: "api.business.githubcopilot.com,api.enterprise.githubcopilot.com,api.github.com,api.githubcopilot.com,api.individual.githubcopilot.com,api.snapcraft.io,archive.ubuntu.com,azure.archive.ubuntu.com,crl.geotrust.com,crl.globalsign.com,crl.identrust.com,crl.sectigo.com,crl.thawte.com,crl.usertrust.com,crl.verisign.com,crl3.digicert.com,crl4.digicert.com,crls.ssl.com,github.com,host.docker.internal,json-schema.org,json.schemastore.org,keyserver.ubuntu.com,ocsp.digicert.com,ocsp.geotrust.com,ocsp.globalsign.com,ocsp.identrust.com,ocsp.sectigo.com,ocsp.ssl.com,ocsp.thawte.com,ocsp.usertrust.com,ocsp.verisign.com,packagecloud.io,packages.cloud.google.com,packages.microsoft.com,ppa.launchpad.net,raw.githubusercontent.com,registry.npmjs.org,s.symcb.com,s.symcd.com,security.ubuntu.com,telemetry.enterprise.githubcopilot.com,ts-crl.ws.symantec.com,ts-ocsp.ws.symantec.com,www.googleapis.com"
+ GITHUB_SERVER_URL: ${{ github.server_url }}
+ GITHUB_API_URL: ${{ github.api_url }}
+ GH_AW_SAFE_OUTPUTS_HANDLER_CONFIG: "{\"create_discussion\":{\"category\":\"audits\",\"close_older_discussions\":true,\"expires\":168,\"fallback_to_issue\":true,\"max\":1,\"title_prefix\":\"[observability] \"},\"create_issue\":{\"close_older_issues\":true,\"labels\":[\"agentics\",\"warning\",\"observability\"],\"max\":1,\"title_prefix\":\"[observability escalation] \"},\"missing_data\":{},\"missing_tool\":{},\"noop\":{\"max\":1,\"report-as-issue\":\"false\"}}"
+ with:
+ github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/safe_output_handler_manager.cjs');
+ await main();
+ - name: Upload Safe Output Items
+ if: always()
+ uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7
+ with:
+ name: safe-output-items
+ path: /tmp/gh-aw/safe-output-items.jsonl
+ if-no-files-found: ignore
+
diff --git a/.github/workflows/agentic-observability-kit.md b/.github/workflows/agentic-observability-kit.md
new file mode 100644
index 0000000000..8b4f3408ca
--- /dev/null
+++ b/.github/workflows/agentic-observability-kit.md
@@ -0,0 +1,238 @@
+---
+description: Drop-in observability kit for repositories using agentic workflows
+on:
+ schedule: weekly on monday around 08:00
+ workflow_dispatch:
+permissions:
+ contents: read
+ actions: read
+ issues: read
+ pull-requests: read
+ discussions: read
+engine: copilot
+strict: true
+tracker-id: agentic-observability-kit
+tools:
+ agentic-workflows:
+ github:
+ toolsets: [default, discussions]
+safe-outputs:
+ mentions: false
+ allowed-github-references: []
+ concurrency-group: "agentic-observability-kit-safe-outputs"
+ create-discussion:
+ expires: 7d
+ category: "audits"
+ title-prefix: "[observability] "
+ max: 1
+ close-older-discussions: true
+ create-issue:
+ title-prefix: "[observability escalation] "
+ labels: [agentics, warning, observability]
+ close-older-issues: true
+ max: 1
+ noop:
+ report-as-issue: false
+timeout-minutes: 30
+imports:
+ - shared/reporting.md
+---
+
+# Agentic Observability Kit
+
+You are an agentic workflow observability analyst. Produce one executive report that teams can read quickly, and create at most one escalation issue only when repeated patterns show that repository owners need to take action.
+
+## Mission
+
+Review recent agentic workflow runs and surface the signals that matter operationally:
+
+1. Repeated drift away from a successful baseline
+2. Weak control patterns such as new write posture, new MCP failures, or more blocked requests
+3. Resource-heavy runs that are expensive for the domain they serve
+4. Stable but low-value agentic runs that may be better as deterministic automation
+5. Delegated workflows that lost continuity or are no longer behaving like a consistent cohort
+
+Always create a discussion with the full report. Create an escalation issue only when repeated, actionable problems need durable owner follow-up.
+
+## Data Collection Rules
+
+- Use the `agentic-workflows` MCP tool, not shell commands.
+- Start with the `logs` tool over the last 14 days.
+- Leave `workflow_name` empty so you analyze the full repository.
+- Use `count` large enough to cover the repository, typically `300`.
+- Use the `audit` tool only for up to 3 runs that need deeper inspection.
+- If there are very few runs, still produce a report and explain the limitation.
+
+## Deterministic Episode Model
+
+The logs JSON now includes deterministic lineage fields:
+
+- `episodes[]` for aggregated execution episodes
+- `edges[]` for lineage edges between runs
+
+Treat those structures as the primary source of truth for graph shape, confidence, and episode rollups.
+
+Prefer `episodes[]` and `edges[]` over reconstructing DAGs from raw runs in prompt space. Only fall back to per-run interpretation when episode data is absent or clearly incomplete.
+
+## Signals To Use
+
+The logs JSON already contains the main agentic signals. Prefer these fields over ad hoc heuristics:
+
+- `episodes[].episode_id`
+- `episodes[].kind`
+- `episodes[].confidence`
+- `episodes[].reasons[]`
+- `episodes[].root_run_id`
+- `episodes[].run_ids[]`
+- `episodes[].workflow_names[]`
+- `episodes[].total_runs`
+- `episodes[].total_tokens`
+- `episodes[].total_estimated_cost`
+- `episodes[].total_duration`
+- `episodes[].risky_node_count`
+- `episodes[].write_capable_node_count`
+- `episodes[].mcp_failure_count`
+- `episodes[].blocked_request_count`
+- `episodes[].risk_distribution`
+- `edges[].edge_type`
+- `edges[].confidence`
+- `edges[].reasons[]`
+- `task_domain.name` and `task_domain.label`
+- `behavior_fingerprint.execution_style`
+- `behavior_fingerprint.tool_breadth`
+- `behavior_fingerprint.actuation_style`
+- `behavior_fingerprint.resource_profile`
+- `behavior_fingerprint.dispatch_mode`
+- `agentic_assessments[].kind`
+- `agentic_assessments[].severity`
+- `context.repo`
+- `context.run_id`
+- `context.workflow_id`
+- `context.workflow_call_id`
+- `context.event_type`
+- `comparison.baseline.selection`
+- `comparison.baseline.matched_on[]`
+- `comparison.classification.label`
+- `comparison.classification.reason_codes[]`
+- `comparison.recommendation.action`
+
+Treat these values as the canonical signals for reporting.
+
+## Interpretation Rules
+
+- Use episode-level analysis first. Do not treat connected runs as unrelated when `episodes[]` already groups them.
+- Use per-run detail only to explain which nodes contributed to an episode-level problem.
+- If an episode has low confidence, say so explicitly and avoid overconfident causal claims.
+- If delegated workers look risky in isolation but the enclosing episode looks intentional and well-controlled, say that.
+- If the deterministic episode model appears incomplete or missing expected lineage, report that as an observability finding.
+
+## Reporting Model
+
+The discussion must stay concise and operator-friendly.
+
+### Visible Summary
+
+Keep these sections visible:
+
+1. `### Executive Summary`
+2. `### Key Metrics`
+3. `### Highest Risk Episodes`
+4. `### Recommended Actions`
+
+Include small numeric summaries such as:
+
+- workflows analyzed
+- runs analyzed
+- episodes analyzed
+- high-confidence episodes analyzed
+- runs with `comparison.classification.label == "risky"`
+- runs with medium or high `agentic_assessments`
+- workflows with repeated `overkill_for_agentic`
+- workflows whose comparisons mostly fell back to `latest_success`
+
+### Details
+
+Put detailed per-workflow breakdowns inside `<details>` blocks.
+
+### What Good Reporting Looks Like
+
+For each highlighted episode or workflow, explain:
+
+- what domain it appears to belong to
+- what its behavioral fingerprint looks like
+- whether the deterministic graph shows an orchestrated DAG or delegated episode
+- whether the actor, cost, and risk seem to belong to the workflow itself or to a larger chain
+- what the episode confidence level is and why
+- whether it is stable against a cohort match or only compared to latest success
+- whether the risky behavior is new, repeated, or likely intentional
+- what a team should change next
+
+## Escalation Thresholds
+
+Use the discussion as the complete source of truth for all qualifying workflows and episodes. Only create an escalation issue when one or more episodes or workflows cross these thresholds in the last 14 days:
+
+1. Two or more runs for the same workflow have `comparison.classification.label == "risky"`.
+2. Two or more runs for the same workflow contain `new_mcp_failure` or `blocked_requests_increase` in `comparison.classification.reason_codes`.
+3. Two or more runs for the same workflow contain a medium or high severity `resource_heavy_for_domain` assessment.
+4. Two or more runs for the same workflow contain a medium or high severity `poor_agentic_control` assessment.
+
+Do not open one issue per workflow. Create at most one escalation issue for the whole run.
+
+If no workflow crosses these thresholds, do not create an escalation issue.
+
+If one or more workflows do cross these thresholds, create a single escalation issue that groups the highest-value follow-up work for repository owners. The escalation issue should summarize the workflows that need attention now, why they crossed the thresholds, and what change is recommended first.
+
+Prefer escalating at the episode level when multiple risky runs are part of one coherent DAG. Only fall back to workflow-level escalation when no broader episode can be established with acceptable confidence.
+
+## Optimization Candidates
+
+Do not create issues for these by default. Report them in the discussion unless they are severe and repeated:
+
+- repeated `overkill_for_agentic`
+- workflows that are consistently `lean`, `directed`, and `narrow`
+- workflows that are always compared using `latest_success` instead of `cohort_match`
+
+These are portfolio cleanup opportunities, not immediate incidents.
+
+## Use Of Audit
+
+Use `audit` only when the logs summary is not enough to explain a top problem. Good audit candidates are:
+
+- the newest risky run for a workflow with repeated warnings
+- a run with a new MCP failure
+- a run that changed from read-only to write-capable posture
+
+When you use `audit`, fold the extra evidence back into the report instead of dumping raw output.
+
+## Output Requirements
+
+### Discussion
+
+Always create one discussion that includes:
+
+- the date range analyzed
+- any important orchestrator, worker, or workflow_run chains that materially change interpretation
+- the most important inferred episodes and their confidence levels
+- all workflows that crossed the escalation thresholds
+- the workflows with the clearest repeated risk
+- the most common assessment kinds
+- a short list of deterministic candidates
+- a short list of workflows that need owner attention now
+
+The discussion should cover all qualifying workflows even when no escalation issue is created.
+
+### Issues
+
+Only create an escalation issue when at least one workflow crossed the escalation thresholds. When you do:
+
+- create one issue for the whole run, not one issue per workflow
+- use a concrete title that signals repository-level owner attention is needed
+- group the escalated workflows in priority order
+- explain the evidence with run counts and the specific assessment or comparison reason codes
+- include the most relevant recommendation for each escalated workflow
+- link up to 3 representative runs across the highest-priority workflows
+- make the issue concise enough to function as a backlog item, with the full detail living in the discussion
+
+### No-op
+
+If the repository has no recent runs or no report can be produced, call `noop` with a short explanation. Otherwise do not use `noop`.
diff --git a/actions/setup/js/generate_observability_summary.cjs b/actions/setup/js/generate_observability_summary.cjs
new file mode 100644
index 0000000000..0f28bb12a4
--- /dev/null
+++ b/actions/setup/js/generate_observability_summary.cjs
@@ -0,0 +1,133 @@
+// @ts-check
+///
+
+const fs = require("fs");
+
+const AW_INFO_PATH = "/tmp/gh-aw/aw_info.json";
+const AGENT_OUTPUT_PATH = "/tmp/gh-aw/agent_output.json";
+const gatewayEventPaths = ["/tmp/gh-aw/mcp-logs/gateway.jsonl", "/tmp/gh-aw/mcp-logs/rpc-messages.jsonl"];
+
+function readJSONIfExists(path) {
+ if (!fs.existsSync(path)) {
+ return null;
+ }
+
+ try {
+ return JSON.parse(fs.readFileSync(path, "utf8"));
+ } catch {
+ return null;
+ }
+}
+
+function countBlockedRequests() {
+ let total = 0;
+
+ for (const path of gatewayEventPaths) {
+ if (!fs.existsSync(path)) {
+ continue;
+ }
+
+ const lines = fs.readFileSync(path, "utf8").split("\n");
+ for (const raw of lines) {
+ const line = raw.trim();
+ if (!line) continue;
+ try {
+ const entry = JSON.parse(line);
+ if (entry && entry.type === "DIFC_FILTERED") total++;
+ } catch {
+ // skip malformed lines
+ }
+ }
+ }
+
+ return total;
+}
+
+function uniqueCreatedItemTypes(items) {
+ const types = new Set();
+
+ for (const item of items) {
+ if (item && typeof item.type === "string" && item.type.trim() !== "") {
+ types.add(item.type);
+ }
+ }
+
+ return [...types].sort();
+}
+
+function collectObservabilityData() {
+ const awInfo = readJSONIfExists(AW_INFO_PATH) || {};
+ const agentOutput = readJSONIfExists(AGENT_OUTPUT_PATH) || { items: [], errors: [] };
+ const items = Array.isArray(agentOutput.items) ? agentOutput.items : [];
+ const errors = Array.isArray(agentOutput.errors) ? agentOutput.errors : [];
+ const traceId = awInfo.context && typeof awInfo.context.workflow_call_id === "string" ? awInfo.context.workflow_call_id : "";
+
+ return {
+ workflowName: awInfo.workflow_name || "",
+ engineId: awInfo.engine_id || "",
+ traceId,
+ staged: awInfo.staged === true,
+ firewallEnabled: awInfo.firewall_enabled === true,
+ createdItemCount: items.length,
+ createdItemTypes: uniqueCreatedItemTypes(items),
+ outputErrorCount: errors.length,
+ blockedRequests: countBlockedRequests(),
+ };
+}
+
+function buildObservabilitySummary(data) {
+ const posture = data.createdItemCount > 0 ? "write-capable" : "read-only";
+ const lines = [];
+
+ lines.push("");
+ lines.push("Observability
");
+ lines.push("");
+
+ if (data.workflowName) {
+ lines.push(`- **workflow**: ${data.workflowName}`);
+ }
+ if (data.engineId) {
+ lines.push(`- **engine**: ${data.engineId}`);
+ }
+ if (data.traceId) {
+ lines.push(`- **trace id**: ${data.traceId}`);
+ }
+
+ lines.push(`- **posture**: ${posture}`);
+ lines.push(`- **created items**: ${data.createdItemCount}`);
+ lines.push(`- **blocked requests**: ${data.blockedRequests}`);
+ lines.push(`- **agent output errors**: ${data.outputErrorCount}`);
+ lines.push(`- **firewall enabled**: ${data.firewallEnabled}`);
+ lines.push(`- **staged**: ${data.staged}`);
+
+ if (data.createdItemTypes.length > 0) {
+ lines.push("- **item types**:");
+ for (const itemType of data.createdItemTypes) {
+ lines.push(` - ${itemType}`);
+ }
+ }
+
+ lines.push("");
+ lines.push(" ");
+
+ return lines.join("\n") + "\n";
+}
+
+async function main(core) {
+ const mode = process.env.GH_AW_OBSERVABILITY_JOB_SUMMARY || "";
+ if (mode !== "on") {
+ core.info(`Skipping observability summary: mode=${mode || "unset"}`);
+ return;
+ }
+
+ const data = collectObservabilityData();
+ const markdown = buildObservabilitySummary(data);
+ await core.summary.addRaw(markdown).write();
+ core.info("Generated observability summary in step summary");
+}
+
+// Expose internals for unit testing; `main` is the runtime entry point.
+module.exports = {
+  buildObservabilitySummary,
+  collectObservabilityData,
+  main,
+};
diff --git a/actions/setup/js/generate_observability_summary.test.cjs b/actions/setup/js/generate_observability_summary.test.cjs
new file mode 100644
index 0000000000..56d7a06a8c
--- /dev/null
+++ b/actions/setup/js/generate_observability_summary.test.cjs
@@ -0,0 +1,79 @@
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+import fs from "fs";
+
+const mockCore = {
+ info: vi.fn(),
+ summary: {
+ addRaw: vi.fn().mockReturnThis(),
+ write: vi.fn().mockResolvedValue(),
+ },
+};
+
+global.core = mockCore;
+
+describe("generate_observability_summary.cjs", () => {
+ let module;
+
+  // Fresh state per test: reset mocks, ensure the log directory exists,
+  // opt in via the env flag, and (re)load the module under test.
+  beforeEach(async () => {
+    vi.clearAllMocks();
+    fs.mkdirSync("/tmp/gh-aw/mcp-logs", { recursive: true });
+    process.env.GH_AW_OBSERVABILITY_JOB_SUMMARY = "on";
+    module = await import("./generate_observability_summary.cjs");
+  });
+
+  // Remove the env flag and any artifact files a test may have written.
+  afterEach(() => {
+    delete process.env.GH_AW_OBSERVABILITY_JOB_SUMMARY;
+    for (const path of ["/tmp/gh-aw/aw_info.json", "/tmp/gh-aw/agent_output.json", "/tmp/gh-aw/mcp-logs/gateway.jsonl", "/tmp/gh-aw/mcp-logs/rpc-messages.jsonl"]) {
+      if (fs.existsSync(path)) {
+        fs.unlinkSync(path);
+      }
+    }
+  });
+
+ it("builds summary from runtime observability files", async () => {
+ fs.writeFileSync(
+ "/tmp/gh-aw/aw_info.json",
+ JSON.stringify({
+ workflow_name: "triage-workflow",
+ engine_id: "copilot",
+ staged: false,
+ firewall_enabled: true,
+ context: { workflow_call_id: "trace-123" },
+ })
+ );
+ fs.writeFileSync(
+ "/tmp/gh-aw/agent_output.json",
+ JSON.stringify({
+ items: [{ type: "create_issue" }, { type: "add_comment" }],
+ errors: ["validation failed"],
+ })
+ );
+ fs.writeFileSync("/tmp/gh-aw/mcp-logs/gateway.jsonl", [JSON.stringify({ type: "DIFC_FILTERED" }), JSON.stringify({ type: "REQUEST" })].join("\n"));
+
+ await module.main(mockCore);
+
+ expect(mockCore.summary.addRaw).toHaveBeenCalledTimes(1);
+ const summary = mockCore.summary.addRaw.mock.calls[0][0];
+ expect(summary).toContain("Observability
");
+ expect(summary).toContain("- **workflow**: triage-workflow");
+ expect(summary).toContain("- **engine**: copilot");
+ expect(summary).toContain("- **trace id**: trace-123");
+ expect(summary).toContain("- **posture**: write-capable");
+ expect(summary).toContain("- **created items**: 2");
+ expect(summary).toContain("- **blocked requests**: 1");
+ expect(summary).toContain("- **agent output errors**: 1");
+ expect(summary).toContain(" - add_comment");
+ expect(summary).toContain(" - create_issue");
+ expect(mockCore.summary.write).toHaveBeenCalledTimes(1);
+ });
+
+  // When the opt-in flag is anything other than "on", main must exit early
+  // without touching the step summary.
+  it("skips summary generation when opt-in mode is disabled", async () => {
+    process.env.GH_AW_OBSERVABILITY_JOB_SUMMARY = "off";
+
+    await module.main(mockCore);
+
+    expect(mockCore.summary.addRaw).not.toHaveBeenCalled();
+    expect(mockCore.summary.write).not.toHaveBeenCalled();
+    expect(mockCore.info).toHaveBeenCalledWith("Skipping observability summary: mode=off");
+  });
+});
diff --git a/pkg/cli/audit.go b/pkg/cli/audit.go
index eab09877d7..51f2b18514 100644
--- a/pkg/cli/audit.go
+++ b/pkg/cli/audit.go
@@ -346,6 +346,9 @@ func AuditWorkflowRun(ctx context.Context, runID int64, owner, repo, hostname st
fmt.Fprintln(os.Stderr, console.FormatWarningMessage(fmt.Sprintf("Failed to list artifacts: %v", err)))
}
+ currentCreatedItems := extractCreatedItemsFromManifest(runOutputDir)
+ run.SafeItemsCount = len(currentCreatedItems)
+
// Create processed run for report generation
processedRun := ProcessedRun{
Run: run,
@@ -358,9 +361,18 @@ func AuditWorkflowRun(ctx context.Context, runID int64, owner, repo, hostname st
MCPFailures: mcpFailures,
JobDetails: jobDetails,
}
+ awContext, _, _, taskDomain, behaviorFingerprint, agenticAssessments := deriveRunAgenticAnalysis(processedRun, metrics)
+ processedRun.AwContext = awContext
+ processedRun.TaskDomain = taskDomain
+ processedRun.BehaviorFingerprint = behaviorFingerprint
+ processedRun.AgenticAssessments = agenticAssessments
+
+ currentSnapshot := buildAuditComparisonSnapshot(processedRun, currentCreatedItems)
+ comparison := buildAuditComparisonForRun(processedRun, currentSnapshot, runOutputDir, owner, repo, hostname, verbose)
// Build structured audit data
auditData := buildAuditData(processedRun, metrics, mcpToolUsage)
+ auditData.Comparison = comparison
// Render output based on format preference
if jsonOutput {
@@ -420,6 +432,10 @@ func AuditWorkflowRun(ctx context.Context, runID int64, owner, repo, hostname st
ProcessedAt: time.Now(),
Run: run,
Metrics: metrics,
+ AwContext: processedRun.AwContext,
+ TaskDomain: processedRun.TaskDomain,
+ BehaviorFingerprint: processedRun.BehaviorFingerprint,
+ AgenticAssessments: processedRun.AgenticAssessments,
AccessAnalysis: accessAnalysis,
FirewallAnalysis: firewallAnalysis,
PolicyAnalysis: policyAnalysis,
diff --git a/pkg/cli/audit_agentic_analysis.go b/pkg/cli/audit_agentic_analysis.go
new file mode 100644
index 0000000000..9674a2120f
--- /dev/null
+++ b/pkg/cli/audit_agentic_analysis.go
@@ -0,0 +1,346 @@
+package cli
+
+import (
+ "fmt"
+ "path/filepath"
+ "slices"
+ "strings"
+ "time"
+
+ "github.com/github/gh-aw/pkg/timeutil"
+ "github.com/github/gh-aw/pkg/workflow"
+)
+
+// TaskDomainInfo describes the dominant task type inferred for a workflow run.
+type TaskDomainInfo struct {
+	Name   string `json:"name"`             // stable machine id, e.g. "triage"
+	Label  string `json:"label"`            // human-readable label, e.g. "Triage"
+	Reason string `json:"reason,omitempty"` // why this domain was chosen
+}
+
+// BehaviorFingerprint summarizes the run's execution profile in compact dimensions.
+type BehaviorFingerprint struct {
+	ExecutionStyle  string `json:"execution_style"`  // directed | adaptive | exploratory
+	ToolBreadth     string `json:"tool_breadth"`     // narrow | moderate | broad
+	ActuationStyle  string `json:"actuation_style"`  // read_only | selective_write | write_heavy
+	ResourceProfile string `json:"resource_profile"` // lean | moderate | heavy
+	DispatchMode    string `json:"dispatch_mode"`    // standalone | delegated
+}
+
+// AgenticAssessment captures an actionable judgment about the run's behavior.
+type AgenticAssessment struct {
+	Kind           string `json:"kind"`                     // assessment id, e.g. "poor_agentic_control"
+	Severity       string `json:"severity"`                 // info | low | medium | high
+	Summary        string `json:"summary"`                  // one-line judgment
+	Evidence       string `json:"evidence,omitempty"`       // key=value metrics backing the judgment
+	Recommendation string `json:"recommendation,omitempty"` // suggested follow-up action
+}
+
+// buildToolUsageInfo aggregates raw tool-call metrics into one row per
+// prettified tool name, merging duplicates while keeping the maximum observed
+// input/output sizes and duration. Results are sorted by descending call
+// count, then by name for a stable order.
+func buildToolUsageInfo(metrics LogMetrics) []ToolUsageInfo {
+	toolStats := make(map[string]*ToolUsageInfo)
+
+	for _, toolCall := range metrics.ToolCalls {
+		// Different raw names can prettify to the same display key; such
+		// calls are merged into a single row.
+		displayKey := workflow.PrettifyToolName(toolCall.Name)
+		if existing, exists := toolStats[displayKey]; exists {
+			existing.CallCount += toolCall.CallCount
+			if toolCall.MaxInputSize > existing.MaxInputSize {
+				existing.MaxInputSize = toolCall.MaxInputSize
+			}
+			if toolCall.MaxOutputSize > existing.MaxOutputSize {
+				existing.MaxOutputSize = toolCall.MaxOutputSize
+			}
+			if toolCall.MaxDuration > 0 {
+				// MaxDuration is stored as a formatted string, so the stored
+				// value is re-parsed to compare against the new raw duration.
+				maxDuration := timeutil.FormatDuration(toolCall.MaxDuration)
+				if existing.MaxDuration == "" || toolCall.MaxDuration > parseDurationString(existing.MaxDuration) {
+					existing.MaxDuration = maxDuration
+				}
+			}
+			continue
+		}
+
+		info := &ToolUsageInfo{
+			Name:          displayKey,
+			CallCount:     toolCall.CallCount,
+			MaxInputSize:  toolCall.MaxInputSize,
+			MaxOutputSize: toolCall.MaxOutputSize,
+		}
+		if toolCall.MaxDuration > 0 {
+			info.MaxDuration = timeutil.FormatDuration(toolCall.MaxDuration)
+		}
+		toolStats[displayKey] = info
+	}
+
+	toolUsage := make([]ToolUsageInfo, 0, len(toolStats))
+	for _, info := range toolStats {
+		toolUsage = append(toolUsage, *info)
+	}
+
+	// Most-called tools first; ties broken alphabetically.
+	slices.SortFunc(toolUsage, func(a, b ToolUsageInfo) int {
+		if a.CallCount != b.CallCount {
+			return b.CallCount - a.CallCount
+		}
+		return strings.Compare(a.Name, b.Name)
+	})
+
+	return toolUsage
+}
+
+func deriveRunAgenticAnalysis(processedRun ProcessedRun, metrics LogMetrics) (*AwContext, []ToolUsageInfo, []CreatedItemReport, *TaskDomainInfo, *BehaviorFingerprint, []AgenticAssessment) {
+ var awContext *AwContext
+ if processedRun.AwContext != nil {
+ awContext = processedRun.AwContext
+ } else if processedRun.Run.LogsPath != "" {
+ awInfoPath := filepath.Join(processedRun.Run.LogsPath, "aw_info.json")
+ if info, err := parseAwInfo(awInfoPath, false); err == nil && info != nil {
+ awContext = info.Context
+ }
+ }
+
+ toolUsage := buildToolUsageInfo(metrics)
+ createdItems := extractCreatedItemsFromManifest(processedRun.Run.LogsPath)
+ metricsData := MetricsData{
+ TokenUsage: processedRun.Run.TokenUsage,
+ EstimatedCost: processedRun.Run.EstimatedCost,
+ Turns: processedRun.Run.Turns,
+ ErrorCount: processedRun.Run.ErrorCount,
+ WarningCount: processedRun.Run.WarningCount,
+ }
+
+ taskDomain := detectTaskDomain(processedRun, createdItems, toolUsage, awContext)
+ behaviorFingerprint := buildBehaviorFingerprint(processedRun, metricsData, toolUsage, createdItems, awContext)
+ agenticAssessments := buildAgenticAssessments(processedRun, metricsData, toolUsage, createdItems, taskDomain, behaviorFingerprint, awContext)
+
+ return awContext, toolUsage, createdItems, taskDomain, behaviorFingerprint, agenticAssessments
+}
+
+// detectTaskDomain infers the dominant task domain of a run from workflow
+// metadata, created safe-output item types, tool usage, and dispatch context.
+// Case order is load-bearing: the first match wins, so release/ops keywords
+// take precedence over research, triage, and the other categories.
+func detectTaskDomain(processedRun ProcessedRun, createdItems []CreatedItemReport, toolUsage []ToolUsageInfo, awContext *AwContext) *TaskDomainInfo {
+	// Lowercased haystack of workflow name, path, and trigger event.
+	combined := strings.ToLower(strings.Join([]string{
+		processedRun.Run.WorkflowName,
+		processedRun.Run.WorkflowPath,
+		processedRun.Run.Event,
+	}, " "))
+
+	// Lowercased, space-joined list of created safe-output item types.
+	createdTypes := make([]string, 0, len(createdItems))
+	for _, item := range createdItems {
+		createdTypes = append(createdTypes, strings.ToLower(item.Type))
+	}
+	createdJoined := strings.Join(createdTypes, " ")
+
+	// Lowercased, space-joined list of tool display names.
+	toolNames := make([]string, 0, len(toolUsage))
+	for _, tool := range toolUsage {
+		toolNames = append(toolNames, strings.ToLower(tool.Name))
+	}
+	toolJoined := strings.Join(toolNames, " ")
+
+	switch {
+	case containsAny(combined, "release", "deploy", "publish", "backport", "changelog"):
+		return &TaskDomainInfo{Name: "release_ops", Label: "Release / Ops", Reason: "Workflow metadata matches release or operational automation."}
+	// NOTE(review): "analy" already matches "analysis"; the extra term is
+	// redundant but harmless.
+	case containsAny(combined, "research", "investigat", "analysis", "analy", "report", "audit"):
+		return &TaskDomainInfo{Name: "research", Label: "Research", Reason: "Workflow naming and instructions suggest exploratory analysis or reporting."}
+	case containsAny(combined, "triage", "label", "classif", "route") || containsAny(createdJoined, "add_labels", "remove_labels", "set_issue_type"):
+		return &TaskDomainInfo{Name: "triage", Label: "Triage", Reason: "The run focused on classification, routing, or issue state updates."}
+	case containsAny(combined, "fix", "patch", "repair", "refactor", "swe", "code", "review") || containsAny(createdJoined, "create_pull_request_review_comment", "submit_pull_request_review"):
+		return &TaskDomainInfo{Name: "code_fix", Label: "Code Fix", Reason: "The workflow appears oriented toward code changes or pull request review."}
+	case containsAny(combined, "cleanup", "maint", "update", "deps", "sync", "housekeeping"):
+		return &TaskDomainInfo{Name: "repo_maintenance", Label: "Repo Maintenance", Reason: "Workflow metadata matches repository maintenance or update work."}
+	case containsAny(combined, "issue", "discussion", "comment", "support", "reply") || containsAny(createdJoined, "add_comment", "create_discussion"):
+		return &TaskDomainInfo{Name: "issue_response", Label: "Issue Response", Reason: "The run is primarily interacting with issue, discussion, or comment threads."}
+	case awContext != nil:
+		// Only consulted after keyword checks: dispatched runs with no
+		// stronger signal are treated as delegated automation.
+		return &TaskDomainInfo{Name: "delegated_automation", Label: "Delegated Automation", Reason: "The run was dispatched from an upstream workflow and is acting as a delegated task."}
+	case containsAny(toolJoined, "github_issue_read", "github-discussion-query"):
+		return &TaskDomainInfo{Name: "issue_response", Label: "Issue Response", Reason: "Tool usage centers on repository conversations and issue context."}
+	default:
+		return &TaskDomainInfo{Name: "general_automation", Label: "General Automation", Reason: "The run does not strongly match a narrower workflow domain yet."}
+	}
+}
+
+// buildBehaviorFingerprint condenses a run's observable behavior into five
+// coarse dimensions (execution style, tool breadth, actuation style,
+// resource profile, and dispatch mode) used for cross-run comparison.
+func buildBehaviorFingerprint(processedRun ProcessedRun, metrics MetricsData, toolUsage []ToolUsageInfo, createdItems []CreatedItemReport, awContext *AwContext) *BehaviorFingerprint {
+	distinctTools := len(toolUsage)
+	// Write actions include both manifest-created items and safe outputs.
+	writeActions := len(createdItems) + processedRun.Run.SafeItemsCount
+
+	execution := "directed"
+	if metrics.Turns >= 10 || distinctTools >= 6 {
+		execution = "exploratory"
+	} else if metrics.Turns >= 5 || distinctTools >= 4 {
+		execution = "adaptive"
+	}
+
+	breadth := "narrow"
+	if distinctTools >= 6 {
+		breadth = "broad"
+	} else if distinctTools >= 3 {
+		breadth = "moderate"
+	}
+
+	actuation := "read_only"
+	if writeActions >= 6 {
+		actuation = "write_heavy"
+	} else if writeActions > 0 {
+		actuation = "selective_write"
+	}
+
+	profile := "lean"
+	if processedRun.Run.Duration >= 15*time.Minute || metrics.Turns >= 12 || distinctTools >= 6 || writeActions >= 8 {
+		profile = "heavy"
+	} else if processedRun.Run.Duration >= 5*time.Minute || metrics.Turns >= 6 || distinctTools >= 4 || writeActions >= 3 {
+		profile = "moderate"
+	}
+
+	dispatch := "standalone"
+	if awContext != nil {
+		dispatch = "delegated"
+	}
+
+	return &BehaviorFingerprint{
+		ExecutionStyle:  execution,
+		ToolBreadth:     breadth,
+		ActuationStyle:  actuation,
+		ResourceProfile: profile,
+		DispatchMode:    dispatch,
+	}
+}
+
+// buildAgenticAssessments derives actionable judgments about a run from its
+// fingerprint, task domain, friction signals (missing tools, MCP failures,
+// missing data), and write activity. Returns nil when domain or fingerprint
+// are unavailable.
+func buildAgenticAssessments(processedRun ProcessedRun, metrics MetricsData, toolUsage []ToolUsageInfo, createdItems []CreatedItemReport, domain *TaskDomainInfo, fingerprint *BehaviorFingerprint, awContext *AwContext) []AgenticAssessment {
+	if domain == nil || fingerprint == nil {
+		return nil
+	}
+
+	assessments := make([]AgenticAssessment, 0, 4)
+	toolTypes := len(toolUsage)
+	// "Friction" counts the distinct trouble signals observed in the run.
+	frictionEvents := len(processedRun.MissingTools) + len(processedRun.MCPFailures) + len(processedRun.MissingData)
+	writeCount := len(createdItems) + processedRun.Run.SafeItemsCount
+
+	// 1. Heavy resource usage relative to the task shape.
+	if fingerprint.ResourceProfile == "heavy" {
+		severity := "medium"
+		if metrics.Turns >= 14 || toolTypes >= 7 || processedRun.Run.Duration >= 20*time.Minute {
+			severity = "high"
+		}
+		assessments = append(assessments, AgenticAssessment{
+			Kind:           "resource_heavy_for_domain",
+			Severity:       severity,
+			Summary:        fmt.Sprintf("This %s run consumed a heavy execution profile for its task shape.", domain.Label),
+			Evidence:       fmt.Sprintf("turns=%d tool_types=%d duration=%s write_actions=%d", metrics.Turns, toolTypes, formatAssessmentDuration(processedRun.Run.Duration), writeCount),
+			Recommendation: "Compare this run to similar successful runs and trim unnecessary turns, tools, or write actions.",
+		})
+	}
+
+	// 2. A stable, narrow run in a routine domain may not need an agent.
+	if (domain.Name == "triage" || domain.Name == "repo_maintenance" || domain.Name == "issue_response") && fingerprint.ResourceProfile == "lean" && fingerprint.ExecutionStyle == "directed" && fingerprint.ToolBreadth == "narrow" {
+		assessments = append(assessments, AgenticAssessment{
+			Kind:           "overkill_for_agentic",
+			Severity:       "low",
+			Summary:        fmt.Sprintf("This %s run looks stable enough that deterministic automation may be a simpler fit.", domain.Label),
+			Evidence:       fmt.Sprintf("turns=%d tool_types=%d actuation=%s", metrics.Turns, toolTypes, fingerprint.ActuationStyle),
+			Recommendation: "Consider whether a scripted rule or deterministic workflow step could replace this agentic path.",
+		})
+	}
+
+	// 3. Friction plus writes, or exploratory behavior in a routine domain,
+	// signals weakly controlled agentic behavior.
+	if frictionEvents >= 3 || (frictionEvents > 0 && writeCount >= 3) || ((domain.Name == "triage" || domain.Name == "repo_maintenance" || domain.Name == "issue_response") && fingerprint.ExecutionStyle == "exploratory") {
+		severity := "medium"
+		if frictionEvents >= 4 || (frictionEvents > 0 && fingerprint.ActuationStyle == "write_heavy") {
+			severity = "high"
+		}
+		assessments = append(assessments, AgenticAssessment{
+			Kind:           "poor_agentic_control",
+			Severity:       severity,
+			Summary:        "The run showed signs of broad or weakly controlled agentic behavior.",
+			Evidence:       fmt.Sprintf("friction=%d execution=%s actuation=%s", frictionEvents, fingerprint.ExecutionStyle, fingerprint.ActuationStyle),
+			Recommendation: "Tighten instructions, reduce unnecessary tools, or delay write actions until the workflow has stronger evidence.",
+		})
+	}
+
+	// 4. Informational: upstream dispatch context survived into this run.
+	if awContext != nil {
+		assessments = append(assessments, AgenticAssessment{
+			Kind:           "delegated_context_present",
+			Severity:       "info",
+			Summary:        "The run preserved upstream dispatch context, which helps trace multi-workflow episodes.",
+			Evidence:       fmt.Sprintf("workflow_call_id=%s event_type=%s", awContext.WorkflowCallID, awContext.EventType),
+			Recommendation: "Use this context when comparing downstream runs so follow-up workflows are evaluated as part of one task chain.",
+		})
+	}
+
+	return assessments
+}
+
+func generateAgenticAssessmentFindings(assessments []AgenticAssessment) []Finding {
+ findings := make([]Finding, 0, len(assessments))
+ for _, assessment := range assessments {
+ category := "agentic"
+ impact := "Review recommended"
+ switch assessment.Kind {
+ case "resource_heavy_for_domain":
+ category = "performance"
+ impact = "Higher cost and latency than a comparable well-behaved run"
+ case "overkill_for_agentic":
+ category = "optimization"
+ impact = "A deterministic implementation may be cheaper and easier to govern"
+ case "poor_agentic_control":
+ category = "agentic"
+ impact = "Broad or weakly controlled behavior can reduce trust even when the run succeeds"
+ case "delegated_context_present":
+ category = "coordination"
+ impact = "Context continuity improves downstream debugging and auditability"
+ }
+ findings = append(findings, Finding{
+ Category: category,
+ Severity: assessment.Severity,
+ Title: prettifyAssessmentKind(assessment.Kind),
+ Description: assessment.Summary,
+ Impact: impact,
+ })
+ }
+ return findings
+}
+
+// generateAgenticAssessmentRecommendations turns actionable assessments into
+// prioritized recommendations; info-level and empty-action entries are skipped.
+func generateAgenticAssessmentRecommendations(assessments []AgenticAssessment) []Recommendation {
+	recommendations := make([]Recommendation, 0, len(assessments))
+	for _, assessment := range assessments {
+		actionable := assessment.Recommendation != "" && assessment.Severity != "info"
+		if !actionable {
+			continue
+		}
+		// Only "high" severity escalates priority; everything else is medium.
+		priority := "medium"
+		if assessment.Severity == "high" {
+			priority = "high"
+		}
+		recommendations = append(recommendations, Recommendation{
+			Priority: priority,
+			Action:   assessment.Recommendation,
+			Reason:   assessment.Summary,
+		})
+	}
+	return recommendations
+}
+
+// containsAny reports whether value contains at least one of terms as a
+// substring. With no terms it returns false.
+func containsAny(value string, terms ...string) bool {
+	return slices.ContainsFunc(terms, func(term string) bool {
+		return strings.Contains(value, term)
+	})
+}
+
+// prettifyAssessmentKind maps a machine-readable assessment kind to its
+// human-readable title; unknown kinds fall back to underscores-to-spaces.
+func prettifyAssessmentKind(kind string) string {
+	titles := map[string]string{
+		"resource_heavy_for_domain": "Resource Heavy For Domain",
+		"overkill_for_agentic":      "Potential Deterministic Alternative",
+		"poor_agentic_control":      "Weak Agentic Control",
+		"delegated_context_present": "Dispatch Context Preserved",
+	}
+	if title, ok := titles[kind]; ok {
+		return title
+	}
+	return strings.ReplaceAll(kind, "_", " ")
+}
+
+// formatAssessmentDuration renders a duration for assessment evidence lines;
+// zero or negative durations have no meaningful rendering and become "n/a".
+func formatAssessmentDuration(duration time.Duration) string {
+	if duration > 0 {
+		return duration.String()
+	}
+	return "n/a"
+}
diff --git a/pkg/cli/audit_agentic_analysis_test.go b/pkg/cli/audit_agentic_analysis_test.go
new file mode 100644
index 0000000000..96fa46fd7d
--- /dev/null
+++ b/pkg/cli/audit_agentic_analysis_test.go
@@ -0,0 +1,107 @@
+//go:build !integration
+
+package cli
+
+import (
+ "testing"
+ "time"
+
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+)
+
+// TestDetectTaskDomain verifies that research-flavored workflow metadata
+// (name/path/event) maps to the "research" task domain.
+func TestDetectTaskDomain(t *testing.T) {
+	processedRun := ProcessedRun{
+		Run: WorkflowRun{
+			WorkflowName: "Weekly Research Report",
+			WorkflowPath: ".github/workflows/weekly-research.yml",
+			Event:        "schedule",
+		},
+	}
+
+	// No created items, tools, or dispatch context: metadata alone decides.
+	domain := detectTaskDomain(processedRun, nil, nil, nil)
+	require.NotNil(t, domain, "domain should be detected")
+	assert.Equal(t, "research", domain.Name)
+	assert.Equal(t, "Research", domain.Label)
+}
+
+// TestBuildAgenticAssessmentsFlagsPotentialDeterministicAlternative checks
+// that a lean, directed, narrow triage run is flagged as a candidate for a
+// deterministic (non-agentic) implementation.
+func TestBuildAgenticAssessmentsFlagsPotentialDeterministicAlternative(t *testing.T) {
+	processedRun := ProcessedRun{
+		Run: WorkflowRun{
+			WorkflowName: "Issue Triage",
+			Turns:        2,
+			Duration:     2 * time.Minute,
+		},
+	}
+	metrics := MetricsData{Turns: 2}
+	toolUsage := []ToolUsageInfo{{Name: "github_issue_read", CallCount: 1}}
+	domain := &TaskDomainInfo{Name: "triage", Label: "Triage"}
+	// Fingerprint chosen to satisfy every condition of the "overkill" rule.
+	fingerprint := &BehaviorFingerprint{
+		ExecutionStyle:  "directed",
+		ToolBreadth:     "narrow",
+		ActuationStyle:  "read_only",
+		ResourceProfile: "lean",
+		DispatchMode:    "standalone",
+	}
+
+	assessments := buildAgenticAssessments(processedRun, metrics, toolUsage, nil, domain, fingerprint, nil)
+	require.NotEmpty(t, assessments)
+	assert.Equal(t, "overkill_for_agentic", assessments[0].Kind)
+}
+
+// TestBuildAgenticAssessmentsFlagsResourceHeavyRun checks that a long run
+// with many turns, seven distinct tools, and several write actions produces
+// a high-severity resource_heavy_for_domain assessment.
+func TestBuildAgenticAssessmentsFlagsResourceHeavyRun(t *testing.T) {
+	processedRun := ProcessedRun{
+		Run: WorkflowRun{
+			WorkflowName:   "Deep Research",
+			Turns:          15,
+			Duration:       22 * time.Minute,
+			SafeItemsCount: 4,
+		},
+	}
+	metrics := MetricsData{Turns: 15}
+	// Seven distinct tools push the fingerprint into the "heavy" profile.
+	toolUsage := []ToolUsageInfo{
+		{Name: "bash", CallCount: 4},
+		{Name: "grep", CallCount: 3},
+		{Name: "gh", CallCount: 2},
+		{Name: "github_issue_read", CallCount: 2},
+		{Name: "sed", CallCount: 1},
+		{Name: "cat", CallCount: 1},
+		{Name: "jq", CallCount: 1},
+	}
+	domain := &TaskDomainInfo{Name: "research", Label: "Research"}
+	fingerprint := buildBehaviorFingerprint(processedRun, metrics, toolUsage, []CreatedItemReport{{Type: "create_issue"}}, nil)
+
+	assessments := buildAgenticAssessments(processedRun, metrics, toolUsage, []CreatedItemReport{{Type: "create_issue"}}, domain, fingerprint, nil)
+
+	var found bool
+	for _, assessment := range assessments {
+		if assessment.Kind == "resource_heavy_for_domain" {
+			found = true
+			assert.Equal(t, "high", assessment.Severity)
+		}
+	}
+	assert.True(t, found, "resource heavy assessment should be present")
+}
+
+// TestBuildAuditDataIncludesAgenticAnalysis verifies that buildAuditData
+// populates the task domain, behavior fingerprint, and assessments for a
+// minimal triage-style run.
+func TestBuildAuditDataIncludesAgenticAnalysis(t *testing.T) {
+	processedRun := ProcessedRun{
+		Run: WorkflowRun{
+			DatabaseID:   7,
+			WorkflowName: "Issue Triage",
+			WorkflowPath: ".github/workflows/issue-triage.yml",
+			Status:       "completed",
+			Conclusion:   "success",
+			Duration:     3 * time.Minute,
+			Turns:        3,
+			Event:        "issues",
+			LogsPath:     t.TempDir(),
+		},
+	}
+	metrics := LogMetrics{Turns: 3}
+
+	auditData := buildAuditData(processedRun, metrics, nil)
+	require.NotNil(t, auditData.TaskDomain, "task domain should be present")
+	require.NotNil(t, auditData.BehaviorFingerprint, "behavioral fingerprint should be present")
+	assert.NotEmpty(t, auditData.AgenticAssessments, "agentic assessments should be present")
+	assert.Equal(t, "triage", auditData.TaskDomain.Name)
+}
diff --git a/pkg/cli/audit_comparison.go b/pkg/cli/audit_comparison.go
new file mode 100644
index 0000000000..aedcabe3f7
--- /dev/null
+++ b/pkg/cli/audit_comparison.go
@@ -0,0 +1,542 @@
+package cli
+
+import (
+ "encoding/json"
+ "fmt"
+ "net/url"
+ "os"
+ "path/filepath"
+ "slices"
+ "sort"
+ "strings"
+
+ "github.com/github/gh-aw/pkg/workflow"
+)
+
+// AuditComparisonData is the top-level comparison result attached to an
+// audit report; BaselineFound is false when no baseline run could be used.
+type AuditComparisonData struct {
+	BaselineFound  bool                           `json:"baseline_found"`
+	Baseline       *AuditComparisonBaseline       `json:"baseline,omitempty"`
+	Delta          *AuditComparisonDelta          `json:"delta,omitempty"`
+	Classification *AuditComparisonClassification `json:"classification,omitempty"`
+	Recommendation *AuditComparisonRecommendation `json:"recommendation,omitempty"`
+}
+
+// AuditComparisonBaseline identifies the baseline run and how it was chosen.
+type AuditComparisonBaseline struct {
+	RunID        int64    `json:"run_id"`
+	WorkflowName string   `json:"workflow_name,omitempty"`
+	Conclusion   string   `json:"conclusion,omitempty"`
+	CreatedAt    string   `json:"created_at,omitempty"`
+	Selection    string   `json:"selection,omitempty"`  // how the baseline was picked, e.g. "latest_success"
+	MatchedOn    []string `json:"matched_on,omitempty"` // dimensions that matched the current run
+}
+
+// AuditComparisonDelta holds per-dimension before/after changes between the
+// baseline run and the current run.
+type AuditComparisonDelta struct {
+	Turns           AuditComparisonIntDelta         `json:"turns"`
+	Posture         AuditComparisonStringDelta      `json:"posture"`
+	BlockedRequests AuditComparisonIntDelta         `json:"blocked_requests"`
+	MCPFailure      *AuditComparisonMCPFailureDelta `json:"mcp_failure,omitempty"`
+}
+
+// AuditComparisonIntDelta is a before/after pair for an integer dimension.
+type AuditComparisonIntDelta struct {
+	Before  int  `json:"before"`
+	After   int  `json:"after"`
+	Changed bool `json:"changed"`
+}
+
+// AuditComparisonStringDelta is a before/after pair for a string dimension.
+type AuditComparisonStringDelta struct {
+	Before  string `json:"before"`
+	After   string `json:"after"`
+	Changed bool   `json:"changed"`
+}
+
+// AuditComparisonMCPFailureDelta tracks which MCP servers failed before and
+// after, and whether any failure is new in the current run.
+type AuditComparisonMCPFailureDelta struct {
+	Before       []string `json:"before,omitempty"`
+	After        []string `json:"after,omitempty"`
+	NewlyPresent bool     `json:"newly_present"`
+}
+
+// AuditComparisonClassification labels the overall comparison outcome with
+// machine-readable reason codes.
+type AuditComparisonClassification struct {
+	Label       string   `json:"label"`
+	ReasonCodes []string `json:"reason_codes,omitempty"`
+}
+
+// AuditComparisonRecommendation is the single suggested follow-up action.
+type AuditComparisonRecommendation struct {
+	Action string `json:"action"`
+}
+
+// auditComparisonSnapshot is the compact internal view of one run used for
+// delta computation.
+type auditComparisonSnapshot struct {
+	Turns           int
+	Posture         string
+	BlockedRequests int
+	MCPFailures     []string
+}
+
+// auditComparisonCandidate pairs a potential baseline run with its snapshot,
+// analysis, and the matching score computed against the current run.
+type auditComparisonCandidate struct {
+	Run                 WorkflowRun
+	Snapshot            auditComparisonSnapshot
+	TaskDomain          *TaskDomainInfo
+	BehaviorFingerprint *BehaviorFingerprint
+	Selection           string
+	MatchedOn           []string
+	Score               int
+}
+
+const maxAuditComparisonCandidates = 10
+
+func buildAuditComparisonSnapshot(processedRun ProcessedRun, createdItems []CreatedItemReport) auditComparisonSnapshot {
+ blockedRequests := 0
+ if processedRun.FirewallAnalysis != nil {
+ blockedRequests = processedRun.FirewallAnalysis.BlockedRequests
+ }
+
+ return auditComparisonSnapshot{
+ Turns: processedRun.Run.Turns,
+ Posture: deriveAuditPosture(createdItems),
+ BlockedRequests: blockedRequests,
+ MCPFailures: collectMCPFailureServers(processedRun.MCPFailures),
+ }
+}
+
+// loadAuditComparisonSnapshotFromArtifacts rebuilds a comparison snapshot
+// for a baseline run directly from its downloaded log artifacts. Any failure
+// in metrics extraction, firewall analysis, or MCP-failure extraction aborts
+// with a wrapped error.
+func loadAuditComparisonSnapshotFromArtifacts(run WorkflowRun, logsPath string, verbose bool) (auditComparisonSnapshot, error) {
+	metrics, err := extractLogMetrics(logsPath, verbose, run.WorkflowPath)
+	if err != nil {
+		return auditComparisonSnapshot{}, fmt.Errorf("failed to extract baseline metrics: %w", err)
+	}
+
+	firewallAnalysis, err := analyzeFirewallLogs(logsPath, verbose)
+	if err != nil {
+		return auditComparisonSnapshot{}, fmt.Errorf("failed to analyze baseline firewall logs: %w", err)
+	}
+
+	mcpFailures, err := extractMCPFailuresFromRun(logsPath, run, verbose)
+	if err != nil {
+		return auditComparisonSnapshot{}, fmt.Errorf("failed to extract baseline MCP failures: %w", err)
+	}
+
+	// A nil analysis (no firewall data) counts as zero blocked requests.
+	blockedRequests := 0
+	if firewallAnalysis != nil {
+		blockedRequests = firewallAnalysis.BlockedRequests
+	}
+
+	return auditComparisonSnapshot{
+		Turns:           metrics.Turns,
+		Posture:         deriveAuditPosture(extractCreatedItemsFromManifest(logsPath)),
+		BlockedRequests: blockedRequests,
+		MCPFailures:     collectMCPFailureServers(mcpFailures),
+	}, nil
+}
+
+func buildAuditComparisonCandidateFromSummary(summary *RunSummary, logsPath string) auditComparisonCandidate {
+ createdItems := extractCreatedItemsFromManifest(logsPath)
+ posture := deriveAuditPosture(createdItems)
+
+ blockedRequests := 0
+ if summary.FirewallAnalysis != nil {
+ blockedRequests = summary.FirewallAnalysis.BlockedRequests
+ }
+
+ return auditComparisonCandidate{
+ Run: summary.Run,
+ Snapshot: auditComparisonSnapshot{
+ Turns: summary.Metrics.Turns,
+ Posture: posture,
+ BlockedRequests: blockedRequests,
+ MCPFailures: collectMCPFailureServers(summary.MCPFailures),
+ },
+ TaskDomain: summary.TaskDomain,
+ BehaviorFingerprint: summary.BehaviorFingerprint,
+ }
+}
+
+// buildAuditComparisonCandidateFromProcessedRun adapts an already-processed
+// run into a comparison candidate without re-deriving its analysis.
+func buildAuditComparisonCandidateFromProcessedRun(processedRun ProcessedRun) auditComparisonCandidate {
+	manifestItems := extractCreatedItemsFromManifest(processedRun.Run.LogsPath)
+	return auditComparisonCandidate{
+		Run:                 processedRun.Run,
+		Snapshot:            buildAuditComparisonSnapshot(processedRun, manifestItems),
+		TaskDomain:          processedRun.TaskDomain,
+		BehaviorFingerprint: processedRun.BehaviorFingerprint,
+	}
+}
+
+// loadAuditComparisonCandidate builds a baseline comparison candidate for a
+// run, preferring the cached run summary and falling back to re-deriving the
+// snapshot and agentic analysis from downloaded artifacts.
+//
+// Fix: the previous implementation ran extractLogMetrics,
+// analyzeFirewallLogs, and extractMCPFailuresFromRun TWICE on the fallback
+// path (once inside loadAuditComparisonSnapshotFromArtifacts, then again to
+// populate the processed run). Each extraction now runs exactly once, with
+// the same wrapped-error behavior on failure.
+func loadAuditComparisonCandidate(run WorkflowRun, logsPath string, verbose bool) (auditComparisonCandidate, error) {
+	// Fast path: the cached summary already carries everything we need.
+	if summary, ok := loadRunSummary(logsPath, false); ok && summary != nil {
+		candidate := buildAuditComparisonCandidateFromSummary(summary, logsPath)
+		candidate.Run = run
+		return candidate, nil
+	}
+
+	metrics, err := extractLogMetrics(logsPath, verbose, run.WorkflowPath)
+	if err != nil {
+		return auditComparisonCandidate{}, fmt.Errorf("failed to extract baseline metrics: %w", err)
+	}
+	firewallAnalysis, err := analyzeFirewallLogs(logsPath, verbose)
+	if err != nil {
+		return auditComparisonCandidate{}, fmt.Errorf("failed to analyze baseline firewall logs: %w", err)
+	}
+	mcpFailures, err := extractMCPFailuresFromRun(logsPath, run, verbose)
+	if err != nil {
+		return auditComparisonCandidate{}, fmt.Errorf("failed to extract baseline MCP failures: %w", err)
+	}
+
+	// Assemble the snapshot from the single set of extractions.
+	blockedRequests := 0
+	if firewallAnalysis != nil {
+		blockedRequests = firewallAnalysis.BlockedRequests
+	}
+	snapshot := auditComparisonSnapshot{
+		Turns:           metrics.Turns,
+		Posture:         deriveAuditPosture(extractCreatedItemsFromManifest(logsPath)),
+		BlockedRequests: blockedRequests,
+		MCPFailures:     collectMCPFailureServers(mcpFailures),
+	}
+
+	// Populate a processed run so the agentic analysis sees the same
+	// metrics and failure data used for the snapshot.
+	processedRun := ProcessedRun{Run: run}
+	processedRun.Run.TokenUsage = metrics.TokenUsage
+	processedRun.Run.EstimatedCost = metrics.EstimatedCost
+	processedRun.Run.Turns = metrics.Turns
+	processedRun.FirewallAnalysis = firewallAnalysis
+	processedRun.MCPFailures = mcpFailures
+	awContext, _, _, taskDomain, behaviorFingerprint, _ := deriveRunAgenticAnalysis(processedRun, metrics)
+	processedRun.AwContext = awContext
+
+	return auditComparisonCandidate{
+		Run:                 run,
+		Snapshot:            snapshot,
+		TaskDomain:          taskDomain,
+		BehaviorFingerprint: behaviorFingerprint,
+		Selection:           "latest_success",
+		MatchedOn:           nil,
+		Score:               0,
+	}, nil
+}
+
+// scoreAuditComparisonCandidate assigns a cohort-similarity score to the
+// candidate relative to the current run and records which attributes matched.
+// A candidate is promoted to "cohort_match" only when at least one
+// high-signal attribute matched (task domain, execution style, resource
+// profile, or actuation style); otherwise it stays "latest_success" with no
+// matched attributes recorded.
+func scoreAuditComparisonCandidate(current ProcessedRun, candidate *auditComparisonCandidate) {
+	if candidate == nil {
+		return
+	}
+
+	total := 0
+	matched := make([]string, 0, 6)
+	addMatch := func(points int, attribute string) {
+		total += points
+		matched = append(matched, attribute)
+	}
+
+	if current.Run.Event != "" && current.Run.Event == candidate.Run.Event {
+		addMatch(5, "event")
+	}
+	if current.TaskDomain != nil && candidate.TaskDomain != nil && current.TaskDomain.Name == candidate.TaskDomain.Name {
+		addMatch(50, "task_domain")
+	}
+
+	if cur, base := current.BehaviorFingerprint, candidate.BehaviorFingerprint; cur != nil && base != nil {
+		if cur.ExecutionStyle == base.ExecutionStyle {
+			addMatch(20, "execution_style")
+		}
+		if cur.ResourceProfile == base.ResourceProfile {
+			addMatch(25, "resource_profile")
+		}
+		if cur.ActuationStyle == base.ActuationStyle {
+			addMatch(10, "actuation_style")
+		}
+		if cur.DispatchMode == base.DispatchMode {
+			addMatch(5, "dispatch_mode")
+		}
+		if cur.ToolBreadth == base.ToolBreadth {
+			addMatch(2, "tool_breadth")
+		}
+	}
+
+	candidate.Score = total
+	for _, attribute := range matched {
+		switch attribute {
+		case "task_domain", "execution_style", "resource_profile", "actuation_style":
+			candidate.Selection = "cohort_match"
+			candidate.MatchedOn = matched
+			return
+		}
+	}
+
+	candidate.Selection = "latest_success"
+	candidate.MatchedOn = nil
+}
+
+// selectAuditComparisonBaseline scores every candidate against the current
+// run and returns a pointer to the best one: highest score first, with ties
+// broken by the most recent creation time. Returns nil when there are no
+// candidates.
+//
+// NOTE(review): the candidates slice is scored and sorted in place, and the
+// returned pointer aliases its backing array — confirm callers do not rely
+// on the original candidate ordering.
+func selectAuditComparisonBaseline(current ProcessedRun, candidates []auditComparisonCandidate) *auditComparisonCandidate {
+	if len(candidates) == 0 {
+		return nil
+	}
+
+	// Scoring mutates each candidate's Score/Selection/MatchedOn fields.
+	for index := range candidates {
+		scoreAuditComparisonCandidate(current, &candidates[index])
+	}
+
+	// Stable sort preserves the incoming order for candidates that tie on
+	// both score and creation time.
+	sort.SliceStable(candidates, func(left, right int) bool {
+		if candidates[left].Score != candidates[right].Score {
+			return candidates[left].Score > candidates[right].Score
+		}
+		return candidates[left].Run.CreatedAt.After(candidates[right].Run.CreatedAt)
+	})
+
+	return &candidates[0]
+}
+
+// sameAuditComparisonWorkflow reports whether two runs belong to the same
+// workflow. Paths are compared when both are known, otherwise names when
+// both are known; runs with neither field populated on both sides are never
+// considered the same workflow.
+func sameAuditComparisonWorkflow(left WorkflowRun, right WorkflowRun) bool {
+	switch {
+	case left.WorkflowPath != "" && right.WorkflowPath != "":
+		return left.WorkflowPath == right.WorkflowPath
+	case left.WorkflowName != "" && right.WorkflowName != "":
+		return left.WorkflowName == right.WorkflowName
+	default:
+		return false
+	}
+}
+
+// buildAuditComparisonForProcessedRuns selects a baseline for currentRun from
+// the already-processed runs and builds a comparison against it. Eligible
+// candidates are earlier successful runs of the same workflow, excluding the
+// current run itself. Returns a BaselineFound=false result when no candidate
+// qualifies.
+func buildAuditComparisonForProcessedRuns(currentRun ProcessedRun, processedRuns []ProcessedRun) *AuditComparisonData {
+	currentItems := extractCreatedItemsFromManifest(currentRun.Run.LogsPath)
+	currentSnapshot := buildAuditComparisonSnapshot(currentRun, currentItems)
+
+	eligible := func(candidate ProcessedRun) bool {
+		return candidate.Run.DatabaseID != currentRun.Run.DatabaseID &&
+			candidate.Run.Conclusion == "success" &&
+			candidate.Run.CreatedAt.Before(currentRun.Run.CreatedAt) &&
+			sameAuditComparisonWorkflow(currentRun.Run, candidate.Run)
+	}
+
+	candidates := make([]auditComparisonCandidate, 0, len(processedRuns))
+	for _, candidateRun := range processedRuns {
+		if eligible(candidateRun) {
+			candidates = append(candidates, buildAuditComparisonCandidateFromProcessedRun(candidateRun))
+		}
+	}
+
+	selected := selectAuditComparisonBaseline(currentRun, candidates)
+	if selected == nil {
+		return &AuditComparisonData{BaselineFound: false}
+	}
+
+	comparison := buildAuditComparison(currentSnapshot, &selected.Run, &selected.Snapshot)
+	if comparison != nil && comparison.Baseline != nil {
+		// Replace the default selection metadata with the actual strategy
+		// recorded during scoring.
+		comparison.Baseline.Selection = selected.Selection
+		comparison.Baseline.MatchedOn = selected.MatchedOn
+	}
+	return comparison
+}
+
+// buildAuditComparison computes the delta, classification, and recommended
+// action between the current run's snapshot and a baseline snapshot. When
+// either the baseline run or the baseline snapshot is missing it returns a
+// result with BaselineFound=false and all other sections omitted.
+//
+// Labels: "risky" (posture escalated to write-capable, a new MCP failure
+// appeared, or blocked requests grew), "changed" (any other recorded
+// difference), "stable" (no reason codes at all).
+func buildAuditComparison(current auditComparisonSnapshot, baselineRun *WorkflowRun, baseline *auditComparisonSnapshot) *AuditComparisonData {
+	if baselineRun == nil || baseline == nil {
+		return &AuditComparisonData{BaselineFound: false}
+	}
+
+	reasonCodes := make([]string, 0, 4)
+	delta := &AuditComparisonDelta{
+		Turns: AuditComparisonIntDelta{
+			Before:  baseline.Turns,
+			After:   current.Turns,
+			Changed: baseline.Turns != current.Turns,
+		},
+		Posture: AuditComparisonStringDelta{
+			Before:  baseline.Posture,
+			After:   current.Posture,
+			Changed: baseline.Posture != current.Posture,
+		},
+		BlockedRequests: AuditComparisonIntDelta{
+			Before:  baseline.BlockedRequests,
+			After:   current.BlockedRequests,
+			Changed: baseline.BlockedRequests != current.BlockedRequests,
+		},
+	}
+
+	// Reason codes are directional so consumers can distinguish growth from
+	// shrinkage without re-deriving it from the delta values.
+	if current.Turns > baseline.Turns {
+		reasonCodes = append(reasonCodes, "turns_increase")
+	} else if current.Turns < baseline.Turns {
+		reasonCodes = append(reasonCodes, "turns_decrease")
+	}
+	if baseline.Posture != current.Posture {
+		reasonCodes = append(reasonCodes, "posture_changed")
+	}
+	if current.BlockedRequests > baseline.BlockedRequests {
+		reasonCodes = append(reasonCodes, "blocked_requests_increase")
+	} else if current.BlockedRequests < baseline.BlockedRequests {
+		reasonCodes = append(reasonCodes, "blocked_requests_decrease")
+	}
+
+	// The MCP failure delta is emitted whenever either side recorded
+	// failures, not only when a failure is newly present.
+	newMCPFailure := len(baseline.MCPFailures) == 0 && len(current.MCPFailures) > 0
+	mcpFailuresResolved := len(baseline.MCPFailures) > 0 && len(current.MCPFailures) == 0
+	if newMCPFailure || len(baseline.MCPFailures) > 0 || len(current.MCPFailures) > 0 {
+		delta.MCPFailure = &AuditComparisonMCPFailureDelta{
+			Before:       baseline.MCPFailures,
+			After:        current.MCPFailures,
+			NewlyPresent: newMCPFailure,
+		}
+	}
+	if newMCPFailure {
+		reasonCodes = append(reasonCodes, "new_mcp_failure")
+	} else if mcpFailuresResolved {
+		reasonCodes = append(reasonCodes, "mcp_failures_resolved")
+	}
+
+	// First matching case wins: the risky conditions are listed before the
+	// milder "changed" conditions so an escalation always outranks them.
+	label := "stable"
+	switch {
+	case delta.Posture.Before == "read_only" && delta.Posture.After == "write_capable":
+		label = "risky"
+	case newMCPFailure:
+		label = "risky"
+	case current.BlockedRequests > baseline.BlockedRequests:
+		label = "risky"
+	case delta.Posture.Before != "" && delta.Posture.After != "" && delta.Posture.Before != delta.Posture.After:
+		label = "changed"
+	case mcpFailuresResolved:
+		label = "changed"
+	case current.BlockedRequests < baseline.BlockedRequests:
+		label = "changed"
+	case len(reasonCodes) > 0:
+		label = "changed"
+	}
+
+	return &AuditComparisonData{
+		BaselineFound: true,
+		Baseline: &AuditComparisonBaseline{
+			RunID:        baselineRun.DatabaseID,
+			WorkflowName: baselineRun.WorkflowName,
+			Conclusion:   baselineRun.Conclusion,
+			CreatedAt:    baselineRun.CreatedAt.Format("2006-01-02T15:04:05Z07:00"),
+			// Default strategy; callers that matched a cohort overwrite
+			// Selection (and MatchedOn) after this returns.
+			Selection: "latest_success",
+		},
+		Delta: delta,
+		Classification: &AuditComparisonClassification{
+			Label:       label,
+			ReasonCodes: reasonCodes,
+		},
+		Recommendation: &AuditComparisonRecommendation{
+			Action: recommendAuditComparisonAction(label, delta),
+		},
+	}
+}
+
+// recommendAuditComparisonAction maps a classification label and delta to a
+// single human-readable follow-up action. Priorities, highest first: posture
+// escalation to write-capable, a newly present MCP failure, blocked-request
+// growth, turn growth, then a generic review prompt.
+func recommendAuditComparisonAction(label string, delta *AuditComparisonDelta) string {
+	if delta == nil || label == "stable" {
+		return "No action needed; this run matches the selected successful baseline closely."
+	}
+
+	switch {
+	case delta.Posture.Before == "read_only" && delta.Posture.After == "write_capable":
+		return "Review first-time write-capable behavior and add a guardrail before enabling by default."
+	case delta.MCPFailure != nil && delta.MCPFailure.NewlyPresent:
+		return "Inspect the new MCP failure and restore tool availability before relying on this workflow."
+	case delta.BlockedRequests.After > delta.BlockedRequests.Before:
+		return "Review network policy changes before treating the new blocked requests as normal behavior."
+	case delta.Turns.After > delta.Turns.Before:
+		return "Compare prompt or task-shape changes because this run needed more turns than the selected successful baseline."
+	default:
+		return "Review the behavior change against the selected successful baseline before treating it as the new normal."
+	}
+}
+
+// deriveAuditPosture classifies a run as "write_capable" when it created any
+// items and "read_only" otherwise.
+func deriveAuditPosture(createdItems []CreatedItemReport) string {
+	if len(createdItems) == 0 {
+		return "read_only"
+	}
+	return "write_capable"
+}
+
+// collectMCPFailureServers returns the sorted, de-duplicated server names
+// that reported MCP failures, skipping entries whose name is blank or
+// whitespace-only. Returns nil when there are no failures at all.
+func collectMCPFailureServers(failures []MCPFailureReport) []string {
+	if len(failures) == 0 {
+		return nil
+	}
+
+	seen := make(map[string]struct{}, len(failures))
+	servers := make([]string, 0, len(failures))
+	for _, failure := range failures {
+		name := failure.ServerName
+		if strings.TrimSpace(name) == "" {
+			continue
+		}
+		if _, duplicate := seen[name]; duplicate {
+			continue
+		}
+		seen[name] = struct{}{}
+		servers = append(servers, name)
+	}
+	sort.Strings(servers)
+	return servers
+}
+
+// findPreviousSuccessfulWorkflowRuns lists up to maxAuditComparisonCandidates
+// successful runs of the same workflow that were created before the current
+// run, querying the GitHub Actions API through the gh CLI. Returns (nil, nil)
+// when the API yields no matching runs.
+func findPreviousSuccessfulWorkflowRuns(current WorkflowRun, owner, repo, hostname string, verbose bool) ([]WorkflowRun, error) {
+	_ = verbose
+	// The workflow file name doubles as the workflow identifier in the API.
+	workflowID := filepath.Base(current.WorkflowPath)
+	if workflowID == "." || workflowID == "" {
+		return nil, fmt.Errorf("workflow path unavailable for run %d", current.DatabaseID)
+	}
+
+	encodedWorkflowID := url.PathEscape(workflowID)
+	var endpoint string
+	if owner != "" && repo != "" {
+		endpoint = fmt.Sprintf("repos/%s/%s/actions/workflows/%s/runs?per_page=%d", owner, repo, encodedWorkflowID, maxAuditComparisonCandidates)
+	} else {
+		// Without an explicit repo, let gh resolve {owner}/{repo} from the
+		// current directory's git remote.
+		endpoint = fmt.Sprintf("repos/{owner}/{repo}/actions/workflows/%s/runs?per_page=%d", encodedWorkflowID, maxAuditComparisonCandidates)
+	}
+
+	// The jq filter compares created_at strings lexicographically, which is
+	// only correct when both sides use the same UTC ("Z") offset the GitHub
+	// API emits — so normalize the cutoff to UTC before formatting. (For a
+	// CreatedAt that is already UTC this is a no-op.)
+	cutoff := current.CreatedAt.UTC().Format("2006-01-02T15:04:05Z07:00")
+	jq := fmt.Sprintf(`[.workflow_runs[] | select(.id != %d and .conclusion == "success" and .created_at < "%s") | {databaseId: .id, number: .run_number, url: .html_url, status: .status, conclusion: .conclusion, workflowName: .name, workflowPath: .path, createdAt: .created_at, startedAt: .run_started_at, updatedAt: .updated_at, event: .event, headBranch: .head_branch, headSha: .head_sha, displayTitle: .display_title}]`, current.DatabaseID, cutoff)
+
+	args := []string{"api"}
+	if hostname != "" && hostname != "github.com" {
+		args = append(args, "--hostname", hostname)
+	}
+	args = append(args, endpoint, "--jq", jq)
+
+	output, err := workflow.RunGHCombined("Fetching previous successful workflow run...", args...)
+	if err != nil {
+		return nil, fmt.Errorf("failed to fetch previous successful workflow run: %w", err)
+	}
+
+	trimmed := strings.TrimSpace(string(output))
+	if trimmed == "null" || trimmed == "" || trimmed == "[]" {
+		return nil, nil
+	}
+
+	var runs []WorkflowRun
+	if err := json.Unmarshal(output, &runs); err != nil {
+		return nil, fmt.Errorf("failed to parse previous successful workflow runs: %w", err)
+	}
+
+	// Some API responses report the workflow file path as its name; swap in
+	// the human-readable display name when one can be resolved.
+	for index := range runs {
+		if strings.HasPrefix(runs[index].WorkflowName, ".github/") {
+			if displayName := resolveWorkflowDisplayName(runs[index].WorkflowPath, owner, repo, hostname); displayName != "" {
+				runs[index].WorkflowName = displayName
+			}
+		}
+	}
+
+	return runs, nil
+}
+
+// buildAuditComparisonForRun finds earlier successful runs of the same
+// workflow via the GitHub API, downloads each candidate baseline's artifacts
+// under outputDir, loads candidates, and builds the comparison against the
+// best-matching one. Candidates that fail to download or load are skipped
+// with a log message rather than aborting the whole comparison; if nothing
+// usable remains, a BaselineFound=false result is returned.
+func buildAuditComparisonForRun(currentRun ProcessedRun, currentSnapshot auditComparisonSnapshot, outputDir string, owner, repo, hostname string, verbose bool) *AuditComparisonData {
+	baselineRuns, err := findPreviousSuccessfulWorkflowRuns(currentRun.Run, owner, repo, hostname, verbose)
+	if err != nil {
+		auditLog.Printf("Skipping audit comparison: failed to find baseline: %v", err)
+		return &AuditComparisonData{BaselineFound: false}
+	}
+	if len(baselineRuns) == 0 {
+		return &AuditComparisonData{BaselineFound: false}
+	}
+
+	candidates := make([]auditComparisonCandidate, 0, len(baselineRuns))
+	for _, baselineRun := range baselineRuns {
+		baselineOutputDir := filepath.Join(outputDir, fmt.Sprintf("baseline-%d", baselineRun.DatabaseID))
+		// Only download artifacts when the baseline directory does not
+		// already exist from a previous invocation.
+		if _, err := os.Stat(baselineOutputDir); err != nil {
+			if downloadErr := downloadRunArtifacts(baselineRun.DatabaseID, baselineOutputDir, verbose, owner, repo, hostname); downloadErr != nil {
+				auditLog.Printf("Skipping candidate baseline for run %d: failed to download baseline artifacts: %v", baselineRun.DatabaseID, downloadErr)
+				continue
+			}
+		}
+
+		candidate, candidateErr := loadAuditComparisonCandidate(baselineRun, baselineOutputDir, verbose)
+		if candidateErr != nil {
+			auditLog.Printf("Skipping candidate baseline for run %d: failed to load baseline snapshot: %v", baselineRun.DatabaseID, candidateErr)
+			continue
+		}
+		candidates = append(candidates, candidate)
+	}
+
+	selected := selectAuditComparisonBaseline(currentRun, candidates)
+	if selected == nil {
+		return &AuditComparisonData{BaselineFound: false}
+	}
+
+	comparison := buildAuditComparison(currentSnapshot, &selected.Run, &selected.Snapshot)
+	if comparison != nil && comparison.Baseline != nil {
+		// Replace the default selection metadata with the strategy recorded
+		// during scoring.
+		comparison.Baseline.Selection = selected.Selection
+		comparison.Baseline.MatchedOn = selected.MatchedOn
+	}
+	return comparison
+}
diff --git a/pkg/cli/audit_comparison_test.go b/pkg/cli/audit_comparison_test.go
new file mode 100644
index 0000000000..3296140dc0
--- /dev/null
+++ b/pkg/cli/audit_comparison_test.go
@@ -0,0 +1,137 @@
+//go:build !integration
+
+package cli
+
+import (
+ "testing"
+ "time"
+
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+)
+
+// A nil baseline must still yield a comparison object, with every
+// baseline-derived section omitted.
+func TestBuildAuditComparison_NoBaseline(t *testing.T) {
+	current := auditComparisonSnapshot{Turns: 4, Posture: "read_only"}
+
+	comparison := buildAuditComparison(current, nil, nil)
+
+	require.NotNil(t, comparison, "comparison should still be returned when no baseline exists")
+	assert.False(t, comparison.BaselineFound, "baseline should be marked unavailable")
+	assert.Nil(t, comparison.Baseline, "baseline details should be omitted")
+	assert.Nil(t, comparison.Delta, "delta should be omitted when no baseline exists")
+	assert.Nil(t, comparison.Classification, "classification should be omitted when no baseline exists")
+}
+
+// A run that escalates to write-capable posture, needs more turns, hits more
+// blocked requests, and introduces a new MCP failure must be classified as
+// risky, with every directional reason code present and a recommendation that
+// targets the posture change (the highest-priority condition).
+func TestBuildAuditComparison_RiskyChange(t *testing.T) {
+	baselineRun := &WorkflowRun{
+		DatabaseID:   100,
+		WorkflowName: "triage",
+		Conclusion:   "success",
+		CreatedAt:    time.Date(2026, 3, 20, 12, 0, 0, 0, time.UTC),
+	}
+
+	// Current run regresses on every tracked dimension relative to baseline.
+	comparison := buildAuditComparison(
+		auditComparisonSnapshot{Turns: 11, Posture: "write_capable", BlockedRequests: 7, MCPFailures: []string{"github"}},
+		baselineRun,
+		&auditComparisonSnapshot{Turns: 4, Posture: "read_only", BlockedRequests: 0},
+	)
+
+	require.NotNil(t, comparison, "comparison should be built")
+	require.True(t, comparison.BaselineFound, "baseline should be marked available")
+	require.NotNil(t, comparison.Delta, "delta should be present")
+	require.NotNil(t, comparison.Classification, "classification should be present")
+	require.NotNil(t, comparison.Recommendation, "recommendation should be present")
+
+	assert.Equal(t, "risky", comparison.Classification.Label, "write-capable transition should be risky")
+	assert.Contains(t, comparison.Classification.ReasonCodes, "turns_increase")
+	assert.Contains(t, comparison.Classification.ReasonCodes, "posture_changed")
+	assert.Contains(t, comparison.Classification.ReasonCodes, "blocked_requests_increase")
+	assert.Contains(t, comparison.Classification.ReasonCodes, "new_mcp_failure")
+	assert.Equal(t, 4, comparison.Delta.Turns.Before)
+	assert.Equal(t, 11, comparison.Delta.Turns.After)
+	assert.Equal(t, "read_only", comparison.Delta.Posture.Before)
+	assert.Equal(t, "write_capable", comparison.Delta.Posture.After)
+	assert.True(t, comparison.Delta.MCPFailure.NewlyPresent, "new MCP failure should be marked")
+	assert.Contains(t, comparison.Recommendation.Action, "write-capable", "recommendation should address the risky posture change")
+}
+
+// A run identical to its baseline must be labeled stable, carry no reason
+// codes, and recommend no action.
+func TestBuildAuditComparison_StableRun(t *testing.T) {
+	baselineRun := &WorkflowRun{DatabaseID: 99, WorkflowName: "triage", Conclusion: "success", CreatedAt: time.Now().Add(-time.Hour)}
+	snapshot := auditComparisonSnapshot{Turns: 4, Posture: "read_only", BlockedRequests: 0}
+
+	comparison := buildAuditComparison(snapshot, baselineRun, &snapshot)
+
+	require.NotNil(t, comparison.Classification, "classification should be present")
+	assert.Equal(t, "stable", comparison.Classification.Label, "unchanged runs should be stable")
+	assert.Empty(t, comparison.Classification.ReasonCodes, "stable runs should have no reason codes")
+	assert.Contains(t, comparison.Recommendation.Action, "No action needed", "stable runs should produce a no-op recommendation")
+}
+
+// An older run sharing the current run's task domain and full behavioral
+// fingerprint must beat a more recent run from a different cohort: cohort
+// score outranks recency in baseline selection.
+func TestSelectAuditComparisonBaselinePrefersCohortMatchOverRecency(t *testing.T) {
+	current := ProcessedRun{
+		Run: WorkflowRun{
+			Event: "issues",
+		},
+		TaskDomain: &TaskDomainInfo{Name: "triage", Label: "Triage"},
+		BehaviorFingerprint: &BehaviorFingerprint{
+			ExecutionStyle:  "directed",
+			ToolBreadth:     "narrow",
+			ActuationStyle:  "read_only",
+			ResourceProfile: "lean",
+			DispatchMode:    "standalone",
+		},
+	}
+
+	candidates := []auditComparisonCandidate{
+		// More recent, but different event, task domain, and fingerprint:
+		// should lose despite recency.
+		{
+			Run: WorkflowRun{
+				DatabaseID: 200,
+				CreatedAt:  time.Date(2026, 3, 23, 12, 0, 0, 0, time.UTC),
+				Event:      "push",
+			},
+			TaskDomain: &TaskDomainInfo{Name: "release_ops", Label: "Release / Ops"},
+			BehaviorFingerprint: &BehaviorFingerprint{
+				ExecutionStyle:  "adaptive",
+				ToolBreadth:     "moderate",
+				ActuationStyle:  "selective_write",
+				ResourceProfile: "moderate",
+				DispatchMode:    "standalone",
+			},
+		},
+		// Older, but matches the current run on every cohort attribute.
+		{
+			Run: WorkflowRun{
+				DatabaseID: 150,
+				CreatedAt:  time.Date(2026, 3, 22, 12, 0, 0, 0, time.UTC),
+				Event:      "issues",
+			},
+			TaskDomain: &TaskDomainInfo{Name: "triage", Label: "Triage"},
+			BehaviorFingerprint: &BehaviorFingerprint{
+				ExecutionStyle:  "directed",
+				ToolBreadth:     "narrow",
+				ActuationStyle:  "read_only",
+				ResourceProfile: "lean",
+				DispatchMode:    "standalone",
+			},
+		},
+	}
+
+	selected := selectAuditComparisonBaseline(current, candidates)
+	require.NotNil(t, selected, "baseline should be selected")
+	assert.Equal(t, int64(150), selected.Run.DatabaseID, "cohort-matching run should beat the more recent but behaviorally different run")
+	assert.Equal(t, "cohort_match", selected.Selection)
+	assert.Contains(t, selected.MatchedOn, "task_domain")
+	assert.Contains(t, selected.MatchedOn, "resource_profile")
+	assert.Positive(t, selected.Score, "cohort match should have a positive score")
+}
+
+// A candidate sharing no cohort attributes with the current run must score
+// zero and fall back to the latest-success selection strategy.
+func TestScoreAuditComparisonCandidateFallsBackToLatestSuccess(t *testing.T) {
+	current := ProcessedRun{Run: WorkflowRun{Event: "issues"}}
+	candidate := auditComparisonCandidate{
+		Run: WorkflowRun{DatabaseID: 300, CreatedAt: time.Date(2026, 3, 21, 12, 0, 0, 0, time.UTC), Event: "push"},
+	}
+
+	scoreAuditComparisonCandidate(current, &candidate)
+
+	assert.Zero(t, candidate.Score)
+	assert.Equal(t, "latest_success", candidate.Selection)
+	assert.Nil(t, candidate.MatchedOn)
+}
diff --git a/pkg/cli/audit_report.go b/pkg/cli/audit_report.go
index cbbd36697a..e6de59671c 100644
--- a/pkg/cli/audit_report.go
+++ b/pkg/cli/audit_report.go
@@ -21,9 +21,14 @@ var auditReportLog = logger.New("cli:audit_report")
// AuditData represents the complete structured audit data for a workflow run
type AuditData struct {
Overview OverviewData `json:"overview"`
+ Comparison *AuditComparisonData `json:"comparison,omitempty"`
+ TaskDomain *TaskDomainInfo `json:"task_domain,omitempty"`
+ BehaviorFingerprint *BehaviorFingerprint `json:"behavior_fingerprint,omitempty"`
+ AgenticAssessments []AgenticAssessment `json:"agentic_assessments,omitempty"`
Metrics MetricsData `json:"metrics"`
KeyFindings []Finding `json:"key_findings,omitempty"`
Recommendations []Recommendation `json:"recommendations,omitempty"`
+ ObservabilityInsights []ObservabilityInsight `json:"observability_insights,omitempty"`
PerformanceMetrics *PerformanceMetrics `json:"performance_metrics,omitempty"`
Jobs []JobData `json:"jobs,omitempty"`
DownloadedFiles []FileInfo `json:"downloaded_files"`
@@ -227,18 +232,17 @@ func buildAuditData(processedRun ProcessedRun, metrics LogMetrics, mcpToolUsage
overview.LogsPath = run.LogsPath
}
- // Parse aw_info.json to extract aw_context if present
+ if run.Duration > 0 {
+ overview.Duration = timeutil.FormatDuration(run.Duration)
+ }
+
if run.LogsPath != "" {
awInfoPath := filepath.Join(run.LogsPath, "aw_info.json")
- if info, err := parseAwInfo(awInfoPath, false); err == nil && info != nil {
- overview.AwContext = info.Context
+ if awInfo, err := parseAwInfo(awInfoPath, false); err == nil && awInfo != nil {
+ overview.AwContext = awInfo.Context
}
}
- if run.Duration > 0 {
- overview.Duration = timeutil.FormatDuration(run.Duration)
- }
-
// Build metrics
metricsData := MetricsData{
TokenUsage: run.TokenUsage,
@@ -276,8 +280,6 @@ func buildAuditData(processedRun ProcessedRun, metrics LogMetrics, mcpToolUsage
}
}
- // Build tool usage
- var toolUsage []ToolUsageInfo
toolStats := make(map[string]*ToolUsageInfo)
for _, toolCall := range metrics.ToolCalls {
displayKey := workflow.PrettifyToolName(toolCall.Name)
@@ -290,33 +292,45 @@ func buildAuditData(processedRun ProcessedRun, metrics LogMetrics, mcpToolUsage
existing.MaxOutputSize = toolCall.MaxOutputSize
}
if toolCall.MaxDuration > 0 {
- maxDur := timeutil.FormatDuration(toolCall.MaxDuration)
+ maxDuration := timeutil.FormatDuration(toolCall.MaxDuration)
if existing.MaxDuration == "" || toolCall.MaxDuration > parseDurationString(existing.MaxDuration) {
- existing.MaxDuration = maxDur
+ existing.MaxDuration = maxDuration
}
}
- } else {
- info := &ToolUsageInfo{
- Name: displayKey,
- CallCount: toolCall.CallCount,
- MaxInputSize: toolCall.MaxInputSize,
- MaxOutputSize: toolCall.MaxOutputSize,
- }
- if toolCall.MaxDuration > 0 {
- info.MaxDuration = timeutil.FormatDuration(toolCall.MaxDuration)
- }
- toolStats[displayKey] = info
+ continue
+ }
+
+ toolInfo := &ToolUsageInfo{
+ Name: displayKey,
+ CallCount: toolCall.CallCount,
+ MaxInputSize: toolCall.MaxInputSize,
+ MaxOutputSize: toolCall.MaxOutputSize,
}
+ if toolCall.MaxDuration > 0 {
+ toolInfo.MaxDuration = timeutil.FormatDuration(toolCall.MaxDuration)
+ }
+ toolStats[displayKey] = toolInfo
}
+
+ toolUsage := make([]ToolUsageInfo, 0, len(toolStats))
for _, info := range toolStats {
toolUsage = append(toolUsage, *info)
}
+ createdItems := extractCreatedItemsFromManifest(run.LogsPath)
+ taskDomain := detectTaskDomain(processedRun, createdItems, toolUsage, overview.AwContext)
+ behaviorFingerprint := buildBehaviorFingerprint(processedRun, metricsData, toolUsage, createdItems, overview.AwContext)
+ agenticAssessments := buildAgenticAssessments(processedRun, metricsData, toolUsage, createdItems, taskDomain, behaviorFingerprint, overview.AwContext)
+
// Generate key findings
findings := generateFindings(processedRun, metricsData, errors, warnings)
+ findings = append(findings, generateAgenticAssessmentFindings(agenticAssessments)...)
// Generate recommendations
recommendations := generateRecommendations(processedRun, metricsData, findings)
+ recommendations = append(recommendations, generateAgenticAssessmentRecommendations(agenticAssessments)...)
+
+ observabilityInsights := buildAuditObservabilityInsights(processedRun, metricsData, toolUsage, createdItems)
// Generate performance metrics
performanceMetrics := generatePerformanceMetrics(processedRun, metricsData, toolUsage)
@@ -328,9 +342,13 @@ func buildAuditData(processedRun ProcessedRun, metrics LogMetrics, mcpToolUsage
return AuditData{
Overview: overview,
+ TaskDomain: taskDomain,
+ BehaviorFingerprint: behaviorFingerprint,
+ AgenticAssessments: agenticAssessments,
Metrics: metricsData,
KeyFindings: findings,
Recommendations: recommendations,
+ ObservabilityInsights: observabilityInsights,
PerformanceMetrics: performanceMetrics,
Jobs: jobs,
DownloadedFiles: downloadedFiles,
@@ -345,7 +363,7 @@ func buildAuditData(processedRun ProcessedRun, metrics LogMetrics, mcpToolUsage
Warnings: warnings,
ToolUsage: toolUsage,
MCPToolUsage: mcpToolUsage,
- CreatedItems: extractCreatedItemsFromManifest(run.LogsPath),
+ CreatedItems: createdItems,
}
}
diff --git a/pkg/cli/audit_report_render.go b/pkg/cli/audit_report_render.go
index cc45503526..0db8055c38 100644
--- a/pkg/cli/audit_report_render.go
+++ b/pkg/cli/audit_report_render.go
@@ -7,6 +7,7 @@ import (
"os"
"path/filepath"
"strconv"
+ "strings"
"time"
"github.com/github/gh-aw/pkg/console"
@@ -33,6 +34,30 @@ func renderConsole(data AuditData, logsPath string) {
fmt.Fprintln(os.Stderr)
renderOverview(data.Overview)
+ if data.Comparison != nil {
+ fmt.Fprintln(os.Stderr, console.FormatSectionHeader("Comparison To Similar Successful Run"))
+ fmt.Fprintln(os.Stderr)
+ renderAuditComparison(data.Comparison)
+ }
+
+ if data.TaskDomain != nil {
+ fmt.Fprintln(os.Stderr, console.FormatSectionHeader("Detected Task Domain"))
+ fmt.Fprintln(os.Stderr)
+ renderTaskDomain(data.TaskDomain)
+ }
+
+ if data.BehaviorFingerprint != nil {
+ fmt.Fprintln(os.Stderr, console.FormatSectionHeader("Behavioral Fingerprint"))
+ fmt.Fprintln(os.Stderr)
+ renderBehaviorFingerprint(data.BehaviorFingerprint)
+ }
+
+ if len(data.AgenticAssessments) > 0 {
+ fmt.Fprintln(os.Stderr, console.FormatSectionHeader("Agentic Assessment"))
+ fmt.Fprintln(os.Stderr)
+ renderAgenticAssessments(data.AgenticAssessments)
+ }
+
// Key Findings Section - NEW
if len(data.KeyFindings) > 0 {
auditReportLog.Printf("Rendering %d key findings", len(data.KeyFindings))
@@ -49,6 +74,12 @@ func renderConsole(data AuditData, logsPath string) {
renderRecommendations(data.Recommendations)
}
+ if len(data.ObservabilityInsights) > 0 {
+ fmt.Fprintln(os.Stderr, console.FormatSectionHeader("Observability Insights"))
+ fmt.Fprintln(os.Stderr)
+ renderObservabilityInsights(data.ObservabilityInsights)
+ }
+
// Performance Metrics Section - NEW
if data.PerformanceMetrics != nil {
fmt.Fprintln(os.Stderr, console.FormatSectionHeader("Performance Metrics"))
@@ -188,6 +219,52 @@ func renderConsole(data AuditData, logsPath string) {
fmt.Fprintln(os.Stderr)
}
+// renderAuditComparison prints the baseline comparison section to stderr:
+// baseline identity, selection strategy, classification, the list of changed
+// dimensions, and the recommended action. When no usable baseline exists it
+// prints a single explanatory line instead.
+func renderAuditComparison(comparison *AuditComparisonData) {
+	if comparison == nil {
+		return
+	}
+
+	if !comparison.BaselineFound || comparison.Baseline == nil || comparison.Delta == nil || comparison.Classification == nil {
+		fmt.Fprintln(os.Stderr, " No suitable successful run was available for baseline comparison.")
+		fmt.Fprintln(os.Stderr)
+		return
+	}
+
+	fmt.Fprintf(os.Stderr, " Baseline: run %d", comparison.Baseline.RunID)
+	if comparison.Baseline.Conclusion != "" {
+		fmt.Fprintf(os.Stderr, " (%s)", comparison.Baseline.Conclusion)
+	}
+	fmt.Fprintln(os.Stderr)
+	if comparison.Baseline.Selection != "" {
+		// Selection values are snake_case identifiers; display them as words.
+		fmt.Fprintf(os.Stderr, " Selection: %s\n", strings.ReplaceAll(comparison.Baseline.Selection, "_", " "))
+	}
+	if len(comparison.Baseline.MatchedOn) > 0 {
+		fmt.Fprintf(os.Stderr, " Matched on: %s\n", strings.Join(comparison.Baseline.MatchedOn, ", "))
+	}
+	fmt.Fprintf(os.Stderr, " Classification: %s\n", comparison.Classification.Label)
+	fmt.Fprintln(os.Stderr, " Changes:")
+
+	// Only dimensions that actually changed are listed.
+	if comparison.Delta.Turns.Changed {
+		fmt.Fprintf(os.Stderr, " - Turns: %d -> %d\n", comparison.Delta.Turns.Before, comparison.Delta.Turns.After)
+	}
+	if comparison.Delta.Posture.Changed {
+		fmt.Fprintf(os.Stderr, " - Posture: %s -> %s\n", comparison.Delta.Posture.Before, comparison.Delta.Posture.After)
+	}
+	if comparison.Delta.BlockedRequests.Changed {
+		fmt.Fprintf(os.Stderr, " - Blocked requests: %d -> %d\n", comparison.Delta.BlockedRequests.Before, comparison.Delta.BlockedRequests.After)
+	}
+	if comparison.Delta.MCPFailure != nil && comparison.Delta.MCPFailure.NewlyPresent {
+		fmt.Fprintf(os.Stderr, " - New MCP failure: %s\n", strings.Join(comparison.Delta.MCPFailure.After, ", "))
+	}
+	if len(comparison.Classification.ReasonCodes) == 0 {
+		fmt.Fprintln(os.Stderr, " - No meaningful behavior change from the selected successful baseline")
+	}
+	if comparison.Recommendation != nil && comparison.Recommendation.Action != "" {
+		fmt.Fprintf(os.Stderr, " Recommended action: %s\n", comparison.Recommendation.Action)
+	}
+	fmt.Fprintln(os.Stderr)
+}
+
// renderOverview renders the overview section using the new rendering system
func renderOverview(overview OverviewData) {
// Format Status with optional Conclusion
@@ -215,6 +292,56 @@ func renderMetrics(metrics MetricsData) {
fmt.Fprint(os.Stderr, console.RenderStruct(metrics))
}
+// taskDomainDisplay shapes a TaskDomainInfo for console.RenderStruct output.
+type taskDomainDisplay struct {
+	Domain string `console:"header:Domain"`
+	Reason string `console:"header:Reason"`
+}
+
+// behaviorFingerprintDisplay shapes a BehaviorFingerprint for
+// console.RenderStruct output.
+type behaviorFingerprintDisplay struct {
+	Execution string `console:"header:Execution"`
+	Tools     string `console:"header:Tools"`
+	Actuation string `console:"header:Actuation"`
+	Resource  string `console:"header:Resources"`
+	Dispatch  string `console:"header:Dispatch"`
+}
+
+// renderTaskDomain prints the detected task domain table to stderr; a nil
+// domain renders nothing.
+func renderTaskDomain(domain *TaskDomainInfo) {
+	if domain == nil {
+		return
+	}
+	display := taskDomainDisplay{
+		Domain: domain.Label,
+		Reason: domain.Reason,
+	}
+	fmt.Fprint(os.Stderr, console.RenderStruct(display))
+}
+
+// renderBehaviorFingerprint prints the behavioral fingerprint table to
+// stderr; a nil fingerprint renders nothing.
+func renderBehaviorFingerprint(fingerprint *BehaviorFingerprint) {
+	if fingerprint == nil {
+		return
+	}
+	display := behaviorFingerprintDisplay{
+		Execution: fingerprint.ExecutionStyle,
+		Tools:     fingerprint.ToolBreadth,
+		Actuation: fingerprint.ActuationStyle,
+		Resource:  fingerprint.ResourceProfile,
+		Dispatch:  fingerprint.DispatchMode,
+	}
+	fmt.Fprint(os.Stderr, console.RenderStruct(display))
+}
+
+// renderAgenticAssessments prints each assessment to stderr as an
+// upper-cased-severity-tagged summary, followed by optional evidence and
+// recommendation lines, with a blank line after every entry.
+func renderAgenticAssessments(assessments []AgenticAssessment) {
+	for _, assessment := range assessments {
+		fmt.Fprintf(os.Stderr, " [%s] %s\n", strings.ToUpper(assessment.Severity), assessment.Summary)
+		if assessment.Evidence != "" {
+			fmt.Fprintf(os.Stderr, " Evidence: %s\n", assessment.Evidence)
+		}
+		if assessment.Recommendation != "" {
+			fmt.Fprintf(os.Stderr, " Recommendation: %s\n", assessment.Recommendation)
+		}
+		fmt.Fprintln(os.Stderr)
+	}
+}
+
// renderJobsTable renders the jobs as a table using console.RenderTable
func renderJobsTable(jobs []JobData) {
auditReportLog.Printf("Rendering jobs table with %d jobs", len(jobs))
diff --git a/pkg/cli/audit_test.go b/pkg/cli/audit_test.go
index b4b48842cf..836b1d32ce 100644
--- a/pkg/cli/audit_test.go
+++ b/pkg/cli/audit_test.go
@@ -130,6 +130,7 @@ func TestBuildAuditData(t *testing.T) {
// Build audit data
auditData := buildAuditData(processedRun, metrics, nil)
+ auditData.Comparison = &AuditComparisonData{BaselineFound: false}
// Verify overview
if auditData.Overview.RunID != 123456 {
@@ -163,6 +164,10 @@ func TestBuildAuditData(t *testing.T) {
t.Errorf("Expected warning count 1, got %d", auditData.Metrics.WarningCount)
}
+ if auditData.Comparison == nil {
+ t.Error("Expected comparison field to be assignable on audit data")
+ }
+
// Note: Error and warning extraction was removed from buildAuditData
// The error/warning counts in metrics are preserved but individual error/warning
// extraction via pattern matching is no longer performed
diff --git a/pkg/cli/logs_ci_scenario_test.go b/pkg/cli/logs_ci_scenario_test.go
index 607d4f5d6f..04cd6dae56 100644
--- a/pkg/cli/logs_ci_scenario_test.go
+++ b/pkg/cli/logs_ci_scenario_test.go
@@ -248,12 +248,19 @@ func TestLogsJSONOutputStructure(t *testing.T) {
if _, exists := parsed["runs"]; !exists {
t.Error("Missing 'runs' field in JSON output")
}
+ if _, exists := parsed["episodes"]; !exists {
+ t.Error("Missing 'episodes' field in JSON output")
+ }
+ if _, exists := parsed["edges"]; !exists {
+ t.Error("Missing 'edges' field in JSON output")
+ }
// Verify summary has all required fields
summary := parsed["summary"].(map[string]any)
requiredFields := []string{
"total_runs", "total_duration", "total_tokens", "total_cost",
"total_turns", "total_errors", "total_warnings", "total_missing_tools",
+ "total_episodes", "high_confidence_episodes",
}
for _, field := range requiredFields {
@@ -270,6 +277,22 @@ func TestLogsJSONOutputStructure(t *testing.T) {
if len(runs) != 0 {
t.Errorf("Expected empty runs array, got %d runs", len(runs))
}
+
+ episodes, ok := parsed["episodes"].([]any)
+ if !ok {
+ t.Errorf("Expected 'episodes' to be an array, got %T", parsed["episodes"])
+ }
+ if len(episodes) != 0 {
+ t.Errorf("Expected empty episodes array, got %d episodes", len(episodes))
+ }
+
+ edges, ok := parsed["edges"].([]any)
+ if !ok {
+ t.Errorf("Expected 'edges' to be an array, got %T", parsed["edges"])
+ }
+ if len(edges) != 0 {
+ t.Errorf("Expected empty edges array, got %d edges", len(edges))
+ }
}
// TestSummaryFileWrittenWithNoRuns verifies that the summary.json file is created
diff --git a/pkg/cli/logs_episode.go b/pkg/cli/logs_episode.go
new file mode 100644
index 0000000000..17b2ef3d07
--- /dev/null
+++ b/pkg/cli/logs_episode.go
@@ -0,0 +1,297 @@
+package cli
+
+import (
+ "cmp"
+ "fmt"
+ "slices"
+ "strconv"
+ "time"
+
+ "github.com/github/gh-aw/pkg/timeutil"
+)
+
// EpisodeEdge represents a deterministic lineage edge between two workflow runs.
// An edge is produced when a run's aw_context names another run in the same
// result set as its dispatcher (see buildEpisodeEdge).
type EpisodeEdge struct {
	SourceRunID int64    `json:"source_run_id"` // dispatching (upstream) run
	TargetRunID int64    `json:"target_run_id"` // dispatched (downstream) run
	EdgeType    string   `json:"edge_type"`     // currently always "dispatch_workflow"
	Confidence  string   `json:"confidence"`    // "high" when workflow_call_id is present, else "medium"
	Reasons     []string `json:"reasons,omitempty"` // evidence strings, e.g. "context.run_id"
	SourceRepo  string   `json:"source_repo,omitempty"`
	SourceRef   string   `json:"source_ref,omitempty"` // carries the aw_context WorkflowID value
	EventType   string   `json:"event_type,omitempty"`
	EpisodeID   string   `json:"episode_id,omitempty"` // episode this edge was assigned to
}
+
// EpisodeData represents a deterministic episode rollup derived from workflow runs.
// An episode is a connected component of runs linked by dispatch edges; its
// identity, kind, and confidence come from the strongest member seed
// (see compareEpisodeSeeds).
type EpisodeData struct {
	EpisodeID             string   `json:"episode_id"`
	Kind                  string   `json:"kind"`       // "dispatch_workflow", "workflow_run", or "standalone"
	Confidence            string   `json:"confidence"` // "high", "medium", or "low"
	Reasons               []string `json:"reasons,omitempty"`
	RootRunID             int64    `json:"root_run_id,omitempty"` // member run with the earliest CreatedAt
	RunIDs                []int64  `json:"run_ids"`               // sorted, de-duplicated member run IDs
	WorkflowNames         []string `json:"workflow_names"`        // sorted, de-duplicated member workflow names
	TotalRuns             int      `json:"total_runs"`
	TotalTokens           int      `json:"total_tokens"`
	TotalEstimatedCost    float64  `json:"total_estimated_cost"`
	TotalDuration         string   `json:"total_duration"` // formatted sum of member durations; empty when unknown
	RiskyNodeCount        int      `json:"risky_node_count"`
	WriteCapableNodeCount int      `json:"write_capable_node_count"` // members whose actuation style is not read_only
	MissingToolCount      int      `json:"missing_tool_count"`
	MCPFailureCount       int      `json:"mcp_failure_count"`
	BlockedRequestCount   int      `json:"blocked_request_count"`
	RiskDistribution      string   `json:"risk_distribution"` // "none" (0 risky), "concentrated" (1), "distributed" (>1)
}
+
// episodeAccumulator collects per-run contributions while an episode rollup
// is being assembled in buildEpisodeData.
type episodeAccumulator struct {
	metadata EpisodeData     // the rollup under construction
	duration time.Duration   // summed wall-clock duration across member runs
	runSet   map[int64]bool  // run IDs already appended to metadata.RunIDs
	nameSet  map[string]bool // workflow names already appended to metadata.WorkflowNames
	rootTime time.Time       // earliest CreatedAt seen; its run becomes RootRunID
}
+
// episodeSeed is the per-run classification produced by classifyEpisode,
// before union-find merges runs into shared episodes.
type episodeSeed struct {
	EpisodeID  string   // candidate episode identifier
	Kind       string   // episode kind; precedence defined by seedKindRank
	Confidence string   // "high", "medium", or "low"; precedence defined by seedConfidenceRank
	Reasons    []string // human-readable evidence for the classification
}
+
+func buildEpisodeData(runs []RunData, processedRuns []ProcessedRun) ([]EpisodeData, []EpisodeEdge) {
+ runsByID := make(map[int64]RunData, len(runs))
+ processedByID := make(map[int64]ProcessedRun, len(processedRuns))
+ seedsByRunID := make(map[int64]episodeSeed, len(runs))
+ parents := make(map[int64]int64, len(runs))
+ for _, run := range runs {
+ runsByID[run.DatabaseID] = run
+ episodeID, kind, confidence, reasons := classifyEpisode(run)
+ seedsByRunID[run.DatabaseID] = episodeSeed{EpisodeID: episodeID, Kind: kind, Confidence: confidence, Reasons: append([]string(nil), reasons...)}
+ parents[run.DatabaseID] = run.DatabaseID
+ }
+ for _, processedRun := range processedRuns {
+ processedByID[processedRun.Run.DatabaseID] = processedRun
+ }
+
+ edges := make([]EpisodeEdge, 0)
+ for _, run := range runs {
+ if edge, ok := buildEpisodeEdge(run, seedsByRunID[run.DatabaseID].EpisodeID, runsByID); ok {
+ edges = append(edges, edge)
+ unionEpisodes(parents, edge.SourceRunID, edge.TargetRunID)
+ }
+ }
+
+ episodeMap := make(map[string]*episodeAccumulator)
+ rootMetadata := make(map[int64]episodeSeed)
+ for _, run := range runs {
+ root := findEpisodeParent(parents, run.DatabaseID)
+ seed := seedsByRunID[run.DatabaseID]
+ best, exists := rootMetadata[root]
+ if !exists || compareEpisodeSeeds(seed, best) > 0 {
+ rootMetadata[root] = seed
+ }
+ }
+
+ for _, run := range runs {
+ root := findEpisodeParent(parents, run.DatabaseID)
+ selectedSeed := rootMetadata[root]
+ episodeID, kind, confidence, reasons := selectedSeed.EpisodeID, selectedSeed.Kind, selectedSeed.Confidence, selectedSeed.Reasons
+ acc, exists := episodeMap[episodeID]
+ if !exists {
+ acc = &episodeAccumulator{
+ metadata: EpisodeData{
+ EpisodeID: episodeID,
+ Kind: kind,
+ Confidence: confidence,
+ Reasons: append([]string(nil), reasons...),
+ RunIDs: []int64{},
+ WorkflowNames: []string{},
+ RiskDistribution: "none",
+ },
+ runSet: make(map[int64]bool),
+ nameSet: make(map[string]bool),
+ rootTime: run.CreatedAt,
+ }
+ episodeMap[episodeID] = acc
+ }
+
+ if !acc.runSet[run.DatabaseID] {
+ acc.runSet[run.DatabaseID] = true
+ acc.metadata.RunIDs = append(acc.metadata.RunIDs, run.DatabaseID)
+ }
+ if run.WorkflowName != "" && !acc.nameSet[run.WorkflowName] {
+ acc.nameSet[run.WorkflowName] = true
+ acc.metadata.WorkflowNames = append(acc.metadata.WorkflowNames, run.WorkflowName)
+ }
+
+ acc.metadata.TotalRuns++
+ acc.metadata.TotalTokens += run.TokenUsage
+ acc.metadata.TotalEstimatedCost += run.EstimatedCost
+ if run.Comparison != nil && run.Comparison.Classification != nil && run.Comparison.Classification.Label == "risky" {
+ acc.metadata.RiskyNodeCount++
+ }
+ if run.BehaviorFingerprint != nil && run.BehaviorFingerprint.ActuationStyle != "read_only" {
+ acc.metadata.WriteCapableNodeCount++
+ }
+ acc.metadata.MissingToolCount += run.MissingToolCount
+ if pr, ok := processedByID[run.DatabaseID]; ok {
+ acc.metadata.MCPFailureCount += len(pr.MCPFailures)
+ if pr.FirewallAnalysis != nil {
+ acc.metadata.BlockedRequestCount += pr.FirewallAnalysis.BlockedRequests
+ }
+ }
+ if !run.CreatedAt.IsZero() && (acc.metadata.RootRunID == 0 || run.CreatedAt.Before(acc.rootTime)) {
+ acc.rootTime = run.CreatedAt
+ acc.metadata.RootRunID = run.DatabaseID
+ }
+ if run.StartedAt.IsZero() && run.UpdatedAt.IsZero() {
+ acc.duration += run.CreatedAt.Sub(run.CreatedAt)
+ } else if !run.StartedAt.IsZero() && !run.UpdatedAt.IsZero() && run.UpdatedAt.After(run.StartedAt) {
+ acc.duration += run.UpdatedAt.Sub(run.StartedAt)
+ } else if pr, ok := processedByID[run.DatabaseID]; ok && pr.Run.Duration > 0 {
+ acc.duration += pr.Run.Duration
+ }
+ }
+
+ for index := range edges {
+ root := findEpisodeParent(parents, edges[index].TargetRunID)
+ if selectedSeed, ok := rootMetadata[root]; ok {
+ edges[index].EpisodeID = selectedSeed.EpisodeID
+ }
+ }
+
+ episodes := make([]EpisodeData, 0, len(episodeMap))
+ for _, acc := range episodeMap {
+ slices.Sort(acc.metadata.RunIDs)
+ slices.Sort(acc.metadata.WorkflowNames)
+ if acc.duration > 0 {
+ acc.metadata.TotalDuration = timeutil.FormatDuration(acc.duration)
+ }
+ switch acc.metadata.RiskyNodeCount {
+ case 0:
+ acc.metadata.RiskDistribution = "none"
+ case 1:
+ acc.metadata.RiskDistribution = "concentrated"
+ default:
+ acc.metadata.RiskDistribution = "distributed"
+ }
+ episodes = append(episodes, acc.metadata)
+ }
+
+ slices.SortFunc(episodes, func(a, b EpisodeData) int {
+ if a.RootRunID != b.RootRunID {
+ return cmp.Compare(a.RootRunID, b.RootRunID)
+ }
+ return cmp.Compare(a.EpisodeID, b.EpisodeID)
+ })
+ slices.SortFunc(edges, func(a, b EpisodeEdge) int {
+ if a.SourceRunID != b.SourceRunID {
+ return cmp.Compare(a.SourceRunID, b.SourceRunID)
+ }
+ return cmp.Compare(a.TargetRunID, b.TargetRunID)
+ })
+
+ return episodes, edges
+}
+
// findEpisodeParent resolves runID to the root of its union-find component,
// compressing the traversed path so later lookups are constant time. An ID
// absent from parents (or mapped to itself) is its own root.
func findEpisodeParent(parents map[int64]int64, runID int64) int64 {
	// First pass: walk up the chain to locate the root.
	root := runID
	for {
		next, tracked := parents[root]
		if !tracked || next == root {
			break
		}
		root = next
	}
	// Second pass: repoint every visited node directly at the root.
	node := runID
	for {
		next, tracked := parents[node]
		if !tracked || next == node {
			break
		}
		parents[node] = root
		node = next
	}
	return root
}
+
+func unionEpisodes(parents map[int64]int64, leftRunID, rightRunID int64) {
+ leftRoot := findEpisodeParent(parents, leftRunID)
+ rightRoot := findEpisodeParent(parents, rightRunID)
+ if leftRoot == rightRoot {
+ return
+ }
+ parents[leftRoot] = rightRoot
+}
+
+func compareEpisodeSeeds(left, right episodeSeed) int {
+ if left.Kind != right.Kind {
+ return cmp.Compare(seedKindRank(left.Kind), seedKindRank(right.Kind))
+ }
+ if left.Confidence != right.Confidence {
+ return cmp.Compare(seedConfidenceRank(left.Confidence), seedConfidenceRank(right.Confidence))
+ }
+ return cmp.Compare(left.EpisodeID, right.EpisodeID)
+}
+
// seedKindRank maps an episode kind to its precedence when selecting the
// seed that labels a merged episode; higher values win. Unknown kinds
// (including "standalone") share the lowest rank.
func seedKindRank(kind string) int {
	ranks := map[string]int{
		"workflow_call":     4,
		"dispatch_workflow": 3,
		"workflow_run":      2,
	}
	if rank, known := ranks[kind]; known {
		return rank
	}
	return 1
}
+
// seedConfidenceRank maps a confidence label to a numeric precedence;
// higher values win. Anything other than "high" or "medium" (including
// "low" and unknown labels) shares the lowest rank.
func seedConfidenceRank(confidence string) int {
	ranks := map[string]int{
		"high":   3,
		"medium": 2,
	}
	if rank, known := ranks[confidence]; known {
		return rank
	}
	return 1
}
+
// classifyEpisode derives the seed episode identity for a single run,
// returning (episodeID, kind, confidence, reasons). The checks are ordered
// by precedence:
//  1. aw_context carries a workflow_call_id -> dispatch_workflow, high;
//  2. aw_context carries both run_id and workflow_id -> dispatch_workflow, medium;
//  3. the run was triggered by a workflow_run event -> workflow_run, low;
//  4. otherwise -> standalone, high (no shared lineage markers).
//
// Seed IDs for cases 3 and 4 embed the run's own database ID, so those runs
// start as singleton episodes unless a dispatch edge later unions them with
// others in buildEpisodeData.
func classifyEpisode(run RunData) (string, string, string, []string) {
	if run.AwContext != nil {
		if run.AwContext.WorkflowCallID != "" {
			return "dispatch:" + run.AwContext.WorkflowCallID, "dispatch_workflow", "high", []string{"context.workflow_call_id"}
		}
		if run.AwContext.RunID != "" && run.AwContext.WorkflowID != "" {
			return fmt.Sprintf("dispatch:%s:%s:%s", run.AwContext.Repo, run.AwContext.RunID, run.AwContext.WorkflowID), "dispatch_workflow", "medium", []string{"context.run_id", "context.workflow_id"}
		}
	}
	if run.Event == "workflow_run" {
		return fmt.Sprintf("workflow_run:%d", run.DatabaseID), "workflow_run", "low", []string{"event=workflow_run", "upstream run metadata unavailable in logs summary"}
	}
	return fmt.Sprintf("standalone:%d", run.DatabaseID), "standalone", "high", []string{"no_shared_lineage_markers"}
}
+
+func buildEpisodeEdge(run RunData, episodeID string, runsByID map[int64]RunData) (EpisodeEdge, bool) {
+ if run.AwContext == nil || run.AwContext.RunID == "" {
+ return EpisodeEdge{}, false
+ }
+ sourceRunID, err := strconv.ParseInt(run.AwContext.RunID, 10, 64)
+ if err != nil {
+ return EpisodeEdge{}, false
+ }
+ if _, ok := runsByID[sourceRunID]; !ok {
+ return EpisodeEdge{}, false
+ }
+ confidence := "medium"
+ reasons := []string{"context.run_id"}
+ if run.AwContext.WorkflowCallID != "" {
+ confidence = "high"
+ reasons = append(reasons, "context.workflow_call_id")
+ }
+ if run.AwContext.WorkflowID != "" {
+ reasons = append(reasons, "context.workflow_id")
+ }
+ return EpisodeEdge{
+ SourceRunID: sourceRunID,
+ TargetRunID: run.DatabaseID,
+ EdgeType: "dispatch_workflow",
+ Confidence: confidence,
+ Reasons: reasons,
+ SourceRepo: run.AwContext.Repo,
+ SourceRef: run.AwContext.WorkflowID,
+ EventType: run.AwContext.EventType,
+ EpisodeID: episodeID,
+ }, true
+}
diff --git a/pkg/cli/logs_json_test.go b/pkg/cli/logs_json_test.go
index 5f205797a0..fcfd9f7082 100644
--- a/pkg/cli/logs_json_test.go
+++ b/pkg/cli/logs_json_test.go
@@ -23,6 +23,7 @@ func TestBuildLogsData(t *testing.T) {
DatabaseID: 12345,
Number: 1,
WorkflowName: "Test Workflow",
+ WorkflowPath: ".github/workflows/test-workflow.yml",
Status: "completed",
Conclusion: "success",
Duration: 5 * time.Minute,
@@ -38,6 +39,24 @@ func TestBuildLogsData(t *testing.T) {
Event: "push",
HeadBranch: "main",
},
+ TaskDomain: &TaskDomainInfo{
+ Name: "triage",
+ Label: "Triage",
+ },
+ BehaviorFingerprint: &BehaviorFingerprint{
+ ExecutionStyle: "directed",
+ ToolBreadth: "narrow",
+ ActuationStyle: "read_only",
+ ResourceProfile: "lean",
+ DispatchMode: "standalone",
+ },
+ AgenticAssessments: []AgenticAssessment{
+ {
+ Kind: "overkill_for_agentic",
+ Severity: "low",
+ Summary: "Deterministic automation may be a better fit.",
+ },
+ },
MissingTools: []MissingToolReport{},
MCPFailures: []MCPFailureReport{},
},
@@ -46,6 +65,7 @@ func TestBuildLogsData(t *testing.T) {
DatabaseID: 12346,
Number: 2,
WorkflowName: "Test Workflow",
+ WorkflowPath: ".github/workflows/test-workflow.yml",
Status: "completed",
Conclusion: "failure",
Duration: 3 * time.Minute,
@@ -61,6 +81,17 @@ func TestBuildLogsData(t *testing.T) {
Event: "pull_request",
HeadBranch: "feature",
},
+ TaskDomain: &TaskDomainInfo{
+ Name: "triage",
+ Label: "Triage",
+ },
+ BehaviorFingerprint: &BehaviorFingerprint{
+ ExecutionStyle: "directed",
+ ToolBreadth: "narrow",
+ ActuationStyle: "read_only",
+ ResourceProfile: "lean",
+ DispatchMode: "standalone",
+ },
MissingTools: []MissingToolReport{
{
Tool: "github_search",
@@ -99,16 +130,52 @@ func TestBuildLogsData(t *testing.T) {
if logsData.Summary.TotalMissingTools != 1 {
t.Errorf("Expected TotalMissingTools to be 1, got %d", logsData.Summary.TotalMissingTools)
}
+ if logsData.Summary.TotalEpisodes != 2 {
+ t.Errorf("Expected TotalEpisodes to be 2, got %d", logsData.Summary.TotalEpisodes)
+ }
+ if logsData.Summary.HighConfidenceEpisodes != 2 {
+ t.Errorf("Expected HighConfidenceEpisodes to be 2, got %d", logsData.Summary.HighConfidenceEpisodes)
+ }
// Verify runs data
if len(logsData.Runs) != 2 {
t.Errorf("Expected 2 runs, got %d", len(logsData.Runs))
}
+ if len(logsData.Episodes) != 2 {
+ t.Fatalf("Expected 2 episodes, got %d", len(logsData.Episodes))
+ }
+ if len(logsData.Edges) != 0 {
+ t.Fatalf("Expected 0 edges for standalone runs, got %d", len(logsData.Edges))
+ }
// Verify first run
if logsData.Runs[0].DatabaseID != 12345 {
t.Errorf("Expected DatabaseID 12345, got %d", logsData.Runs[0].DatabaseID)
}
+ if logsData.Runs[0].TaskDomain == nil || logsData.Runs[0].TaskDomain.Name != "triage" {
+ t.Fatalf("Expected first run to include task domain, got %+v", logsData.Runs[0].TaskDomain)
+ }
+ if logsData.Runs[0].BehaviorFingerprint == nil || logsData.Runs[0].BehaviorFingerprint.ResourceProfile != "lean" {
+ t.Fatalf("Expected first run to include behavior fingerprint, got %+v", logsData.Runs[0].BehaviorFingerprint)
+ }
+ if len(logsData.Runs[0].AgenticAssessments) != 1 {
+ t.Fatalf("Expected first run to include 1 agentic assessment, got %d", len(logsData.Runs[0].AgenticAssessments))
+ }
+ if logsData.Runs[0].Comparison == nil {
+ t.Fatal("Expected first run to include comparison payload")
+ }
+ if logsData.Runs[0].Comparison.BaselineFound {
+ t.Fatal("Expected oldest run to have no baseline in logs comparison")
+ }
+ if logsData.Runs[1].Comparison == nil || !logsData.Runs[1].Comparison.BaselineFound {
+ t.Fatalf("Expected newer run to include a baseline comparison, got %+v", logsData.Runs[1].Comparison)
+ }
+ if logsData.Runs[1].Comparison.Baseline == nil || logsData.Runs[1].Comparison.Baseline.Selection != "cohort_match" {
+ t.Fatalf("Expected newer run to use cohort_match baseline, got %+v", logsData.Runs[1].Comparison.Baseline)
+ }
+ if logsData.Runs[1].Comparison.Baseline == nil || logsData.Runs[1].Comparison.Baseline.RunID != 12345 {
+ t.Fatalf("Expected newer run baseline to point to run 12345, got %+v", logsData.Runs[1].Comparison.Baseline)
+ }
// Duration format from formatDuration is "5.0m", not "5m0s"
if logsData.Runs[0].Duration == "" {
t.Errorf("Expected non-empty Duration, got empty string")
@@ -130,14 +197,16 @@ func TestRenderLogsJSON(t *testing.T) {
// Create sample logs data
logsData := LogsData{
Summary: LogsSummary{
- TotalRuns: 2,
- TotalDuration: "8m0s",
- TotalTokens: 1500,
- TotalCost: 0.075,
- TotalTurns: 5,
- TotalErrors: 1,
- TotalWarnings: 1,
- TotalMissingTools: 1,
+ TotalRuns: 2,
+ TotalDuration: "8m0s",
+ TotalTokens: 1500,
+ TotalCost: 0.075,
+ TotalTurns: 5,
+ TotalErrors: 1,
+ TotalWarnings: 1,
+ TotalMissingTools: 1,
+ TotalEpisodes: 1,
+ HighConfidenceEpisodes: 1,
},
Runs: []RunData{
{
@@ -157,8 +226,29 @@ func TestRenderLogsJSON(t *testing.T) {
LogsPath: filepath.Join(tmpDir, "run-12345"),
Event: "push",
Branch: "main",
+ Comparison: &AuditComparisonData{
+ BaselineFound: true,
+ Baseline: &AuditComparisonBaseline{
+ RunID: 12000,
+ Selection: "cohort_match",
+ MatchedOn: []string{"task_domain", "resource_profile"},
+ },
+ },
+ },
+ },
+ Episodes: []EpisodeData{
+ {
+ EpisodeID: "standalone:12345",
+ Kind: "standalone",
+ Confidence: "high",
+ RunIDs: []int64{12345},
+ WorkflowNames: []string{"Test Workflow"},
+ TotalRuns: 1,
+ TotalTokens: 1000,
+ TotalEstimatedCost: 0.05,
},
},
+ Edges: []EpisodeEdge{},
LogsLocation: tmpDir,
}
@@ -194,9 +284,98 @@ func TestRenderLogsJSON(t *testing.T) {
if parsedData.Summary.TotalTokens != 1500 {
t.Errorf("Expected TotalTokens 1500, got %d", parsedData.Summary.TotalTokens)
}
+ if parsedData.Summary.TotalEpisodes != 1 {
+ t.Errorf("Expected TotalEpisodes 1, got %d", parsedData.Summary.TotalEpisodes)
+ }
if len(parsedData.Runs) != 1 {
t.Errorf("Expected 1 run in JSON, got %d", len(parsedData.Runs))
}
+ if parsedData.Runs[0].Comparison == nil || parsedData.Runs[0].Comparison.Baseline == nil || parsedData.Runs[0].Comparison.Baseline.Selection != "cohort_match" {
+ t.Fatalf("Expected comparison metadata to survive JSON round-trip, got %+v", parsedData.Runs[0].Comparison)
+ }
+}
+
+func TestBuildLogsDataAggregatesDispatchEpisode(t *testing.T) {
+ tmpDir := testutil.TempDir(t, "test-episode-*")
+ processedRuns := []ProcessedRun{
+ {
+ Run: WorkflowRun{
+ DatabaseID: 2001,
+ WorkflowName: "orchestrator",
+ WorkflowPath: ".github/workflows/orchestrator.yml",
+ Status: "completed",
+ Conclusion: "success",
+ Duration: 2 * time.Minute,
+ TokenUsage: 300,
+ EstimatedCost: 0.01,
+ CreatedAt: time.Date(2024, 2, 1, 12, 0, 0, 0, time.UTC),
+ StartedAt: time.Date(2024, 2, 1, 12, 0, 0, 0, time.UTC),
+ UpdatedAt: time.Date(2024, 2, 1, 12, 2, 0, 0, time.UTC),
+ LogsPath: filepath.Join(tmpDir, "run-2001"),
+ },
+ },
+ {
+ Run: WorkflowRun{
+ DatabaseID: 2002,
+ WorkflowName: "worker",
+ WorkflowPath: ".github/workflows/worker.yml",
+ Status: "completed",
+ Conclusion: "success",
+ Duration: 4 * time.Minute,
+ TokenUsage: 700,
+ EstimatedCost: 0.03,
+ MissingToolCount: 1,
+ CreatedAt: time.Date(2024, 2, 1, 12, 3, 0, 0, time.UTC),
+ StartedAt: time.Date(2024, 2, 1, 12, 3, 0, 0, time.UTC),
+ UpdatedAt: time.Date(2024, 2, 1, 12, 7, 0, 0, time.UTC),
+ LogsPath: filepath.Join(tmpDir, "run-2002"),
+ },
+ AwContext: &AwContext{
+ Repo: "github/gh-aw",
+ RunID: "2001",
+ WorkflowID: "github/gh-aw/.github/workflows/orchestrator.yml@refs/heads/main",
+ WorkflowCallID: "2001-1",
+ EventType: "workflow_dispatch",
+ },
+ BehaviorFingerprint: &BehaviorFingerprint{ActuationStyle: "selective_write"},
+ MCPFailures: []MCPFailureReport{{ServerName: "github", Status: "failed"}},
+ },
+ }
+
+ logsData := buildLogsData(processedRuns, tmpDir, nil)
+
+ if logsData.Summary.TotalEpisodes != 1 {
+ t.Fatalf("Expected 1 episode, got %d", logsData.Summary.TotalEpisodes)
+ }
+ if logsData.Summary.HighConfidenceEpisodes != 1 {
+ t.Fatalf("Expected 1 high-confidence episode, got %d", logsData.Summary.HighConfidenceEpisodes)
+ }
+ if len(logsData.Edges) != 1 {
+ t.Fatalf("Expected 1 edge, got %d", len(logsData.Edges))
+ }
+ edge := logsData.Edges[0]
+ if edge.SourceRunID != 2001 || edge.TargetRunID != 2002 {
+ t.Fatalf("Expected edge 2001->2002, got %d->%d", edge.SourceRunID, edge.TargetRunID)
+ }
+ if edge.EdgeType != "dispatch_workflow" {
+ t.Fatalf("Expected dispatch_workflow edge, got %s", edge.EdgeType)
+ }
+ episode := logsData.Episodes[0]
+ if episode.Kind != "dispatch_workflow" {
+ t.Fatalf("Expected dispatch_workflow episode, got %s", episode.Kind)
+ }
+ if episode.TotalRuns != 2 {
+ t.Fatalf("Expected episode TotalRuns 2, got %d", episode.TotalRuns)
+ }
+ if episode.TotalTokens != 1000 {
+ t.Fatalf("Expected episode TotalTokens 1000, got %d", episode.TotalTokens)
+ }
+ if episode.MCPFailureCount != 1 {
+ t.Fatalf("Expected episode MCPFailureCount 1, got %d", episode.MCPFailureCount)
+ }
+ if episode.WriteCapableNodeCount != 1 {
+ t.Fatalf("Expected episode WriteCapableNodeCount 1, got %d", episode.WriteCapableNodeCount)
+ }
}
// TestBuildMissingToolsSummary tests missing tools aggregation
diff --git a/pkg/cli/logs_models.go b/pkg/cli/logs_models.go
index 16fae3eeb4..3e96fc05cd 100644
--- a/pkg/cli/logs_models.go
+++ b/pkg/cli/logs_models.go
@@ -69,6 +69,10 @@ type LogMetrics = workflow.LogMetrics
// ProcessedRun represents a workflow run with its associated analysis
type ProcessedRun struct {
Run WorkflowRun
+ AwContext *AwContext
+ TaskDomain *TaskDomainInfo
+ BehaviorFingerprint *BehaviorFingerprint
+ AgenticAssessments []AgenticAssessment
AccessAnalysis *DomainAnalysis
FirewallAnalysis *FirewallAnalysis
PolicyAnalysis *PolicyAnalysis
@@ -175,28 +179,36 @@ var ErrNoArtifacts = errors.New("no artifacts found for this run")
// - If the CLI version in the summary doesn't match the current version, the run is reprocessed
// - This ensures that bug fixes and improvements in log parsing are automatically applied
type RunSummary struct {
- CLIVersion string `json:"cli_version"` // CLI version used to process this run
- RunID int64 `json:"run_id"` // Workflow run database ID
- ProcessedAt time.Time `json:"processed_at"` // When this summary was created
- Run WorkflowRun `json:"run"` // Full workflow run metadata
- Metrics LogMetrics `json:"metrics"` // Extracted log metrics
- AccessAnalysis *DomainAnalysis `json:"access_analysis"` // Network access analysis
- FirewallAnalysis *FirewallAnalysis `json:"firewall_analysis"` // Firewall log analysis
- PolicyAnalysis *PolicyAnalysis `json:"policy_analysis,omitempty"` // Firewall policy rule attribution
- RedactedDomainsAnalysis *RedactedDomainsAnalysis `json:"redacted_domains_analysis"` // Redacted URL domains analysis
- MissingTools []MissingToolReport `json:"missing_tools"` // Missing tool reports
- MissingData []MissingDataReport `json:"missing_data"` // Missing data reports
- Noops []NoopReport `json:"noops"` // Noop messages
- MCPFailures []MCPFailureReport `json:"mcp_failures"` // MCP server failures
- MCPToolUsage *MCPToolUsageData `json:"mcp_tool_usage,omitempty"` // MCP tool usage data
- ArtifactsList []string `json:"artifacts_list"` // List of downloaded artifact files
- JobDetails []JobInfoWithDuration `json:"job_details"` // Job execution details
+ CLIVersion string `json:"cli_version"` // CLI version used to process this run
+ RunID int64 `json:"run_id"` // Workflow run database ID
+ ProcessedAt time.Time `json:"processed_at"` // When this summary was created
+ Run WorkflowRun `json:"run"` // Full workflow run metadata
+ Metrics LogMetrics `json:"metrics"` // Extracted log metrics
+ AwContext *AwContext `json:"context,omitempty"` // aw_context data from aw_info.json
+ TaskDomain *TaskDomainInfo `json:"task_domain,omitempty"` // Inferred workflow task domain
+ BehaviorFingerprint *BehaviorFingerprint `json:"behavior_fingerprint,omitempty"` // Compact execution profile
+ AgenticAssessments []AgenticAssessment `json:"agentic_assessments,omitempty"` // Derived agentic judgments
+ AccessAnalysis *DomainAnalysis `json:"access_analysis"` // Network access analysis
+ FirewallAnalysis *FirewallAnalysis `json:"firewall_analysis"` // Firewall log analysis
+ PolicyAnalysis *PolicyAnalysis `json:"policy_analysis,omitempty"` // Firewall policy rule attribution
+ RedactedDomainsAnalysis *RedactedDomainsAnalysis `json:"redacted_domains_analysis"` // Redacted URL domains analysis
+ MissingTools []MissingToolReport `json:"missing_tools"` // Missing tool reports
+ MissingData []MissingDataReport `json:"missing_data"` // Missing data reports
+ Noops []NoopReport `json:"noops"` // Noop messages
+ MCPFailures []MCPFailureReport `json:"mcp_failures"` // MCP server failures
+ MCPToolUsage *MCPToolUsageData `json:"mcp_tool_usage,omitempty"` // MCP tool usage data
+ ArtifactsList []string `json:"artifacts_list"` // List of downloaded artifact files
+ JobDetails []JobInfoWithDuration `json:"job_details"` // Job execution details
}
// DownloadResult represents the result of downloading and processing a workflow run
type DownloadResult struct {
Run WorkflowRun
Metrics LogMetrics
+ AwContext *AwContext
+ TaskDomain *TaskDomainInfo
+ BehaviorFingerprint *BehaviorFingerprint
+ AgenticAssessments []AgenticAssessment
AccessAnalysis *DomainAnalysis
FirewallAnalysis *FirewallAnalysis
RedactedDomainsAnalysis *RedactedDomainsAnalysis
diff --git a/pkg/cli/logs_orchestrator.go b/pkg/cli/logs_orchestrator.go
index b1893a08e8..39703d8123 100644
--- a/pkg/cli/logs_orchestrator.go
+++ b/pkg/cli/logs_orchestrator.go
@@ -350,6 +350,10 @@ func DownloadWorkflowLogs(ctx context.Context, workflowName string, count int, s
processedRun := ProcessedRun{
Run: run,
+ AwContext: result.AwContext,
+ TaskDomain: result.TaskDomain,
+ BehaviorFingerprint: result.BehaviorFingerprint,
+ AgenticAssessments: result.AgenticAssessments,
AccessAnalysis: result.AccessAnalysis,
FirewallAnalysis: result.FirewallAnalysis,
RedactedDomainsAnalysis: result.RedactedDomainsAnalysis,
@@ -609,6 +613,10 @@ func downloadRunArtifactsConcurrent(ctx context.Context, runs []WorkflowRun, out
result := DownloadResult{
Run: summary.Run,
Metrics: summary.Metrics,
+ AwContext: summary.AwContext,
+ TaskDomain: summary.TaskDomain,
+ BehaviorFingerprint: summary.BehaviorFingerprint,
+ AgenticAssessments: summary.AgenticAssessments,
AccessAnalysis: summary.AccessAnalysis,
FirewallAnalysis: summary.FirewallAnalysis,
RedactedDomainsAnalysis: summary.RedactedDomainsAnalysis,
@@ -762,6 +770,24 @@ func downloadRunArtifactsConcurrent(ctx context.Context, runs []WorkflowRun, out
}
}
+ processedRun := ProcessedRun{
+ Run: result.Run,
+ AccessAnalysis: accessAnalysis,
+ FirewallAnalysis: firewallAnalysis,
+ RedactedDomainsAnalysis: redactedDomainsAnalysis,
+ MissingTools: missingTools,
+ MissingData: missingData,
+ Noops: noops,
+ MCPFailures: mcpFailures,
+ MCPToolUsage: mcpToolUsage,
+ JobDetails: jobDetails,
+ }
+ awContext, _, _, taskDomain, behaviorFingerprint, agenticAssessments := deriveRunAgenticAnalysis(processedRun, metrics)
+ result.AwContext = awContext
+ result.TaskDomain = taskDomain
+ result.BehaviorFingerprint = behaviorFingerprint
+ result.AgenticAssessments = agenticAssessments
+
// Create and save run summary
summary := &RunSummary{
CLIVersion: GetVersion(),
@@ -769,6 +795,10 @@ func downloadRunArtifactsConcurrent(ctx context.Context, runs []WorkflowRun, out
ProcessedAt: time.Now(),
Run: result.Run,
Metrics: metrics,
+ AwContext: result.AwContext,
+ TaskDomain: result.TaskDomain,
+ BehaviorFingerprint: result.BehaviorFingerprint,
+ AgenticAssessments: result.AgenticAssessments,
AccessAnalysis: accessAnalysis,
FirewallAnalysis: firewallAnalysis,
RedactedDomainsAnalysis: redactedDomainsAnalysis,
diff --git a/pkg/cli/logs_report.go b/pkg/cli/logs_report.go
index 05984b16b7..ef6bbde0f3 100644
--- a/pkg/cli/logs_report.go
+++ b/pkg/cli/logs_report.go
@@ -22,8 +22,11 @@ var reportLog = logger.New("cli:logs_report")
type LogsData struct {
Summary LogsSummary `json:"summary" console:"title:Workflow Logs Summary"`
Runs []RunData `json:"runs" console:"title:Workflow Logs Overview"`
+ Episodes []EpisodeData `json:"episodes" console:"-"`
+ Edges []EpisodeEdge `json:"edges" console:"-"`
ToolUsage []ToolUsageSummary `json:"tool_usage,omitempty" console:"title:🛠️ Tool Usage Summary,omitempty"`
MCPToolUsage *MCPToolUsageSummary `json:"mcp_tool_usage,omitempty" console:"title:🔧 MCP Tool Usage,omitempty"`
+ Observability []ObservabilityInsight `json:"observability_insights,omitempty" console:"-"`
ErrorsAndWarnings []ErrorSummary `json:"errors_and_warnings,omitempty" console:"title:Errors and Warnings,omitempty"`
MissingTools []MissingToolSummary `json:"missing_tools,omitempty" console:"title:🛠️ Missing Tools Summary,omitempty"`
MissingData []MissingDataSummary `json:"missing_data,omitempty" console:"title:📊 Missing Data Summary,omitempty"`
@@ -51,44 +54,50 @@ type ContinuationData struct {
// LogsSummary contains aggregate metrics across all runs
type LogsSummary struct {
- TotalRuns int `json:"total_runs" console:"header:Total Runs"`
- TotalDuration string `json:"total_duration" console:"header:Total Duration"`
- TotalTokens int `json:"total_tokens" console:"header:Total Tokens,format:number"`
- TotalCost float64 `json:"total_cost" console:"header:Total Cost,format:cost"`
- TotalTurns int `json:"total_turns" console:"header:Total Turns"`
- TotalErrors int `json:"total_errors" console:"header:Total Errors"`
- TotalWarnings int `json:"total_warnings" console:"header:Total Warnings"`
- TotalMissingTools int `json:"total_missing_tools" console:"header:Total Missing Tools"`
- TotalMissingData int `json:"total_missing_data" console:"header:Total Missing Data"`
- TotalSafeItems int `json:"total_safe_items" console:"header:Total Safe Items"`
+ TotalRuns int `json:"total_runs" console:"header:Total Runs"`
+ TotalDuration string `json:"total_duration" console:"header:Total Duration"`
+ TotalTokens int `json:"total_tokens" console:"header:Total Tokens,format:number"`
+ TotalCost float64 `json:"total_cost" console:"header:Total Cost,format:cost"`
+ TotalTurns int `json:"total_turns" console:"header:Total Turns"`
+ TotalErrors int `json:"total_errors" console:"header:Total Errors"`
+ TotalWarnings int `json:"total_warnings" console:"header:Total Warnings"`
+ TotalMissingTools int `json:"total_missing_tools" console:"header:Total Missing Tools"`
+ TotalMissingData int `json:"total_missing_data" console:"header:Total Missing Data"`
+ TotalSafeItems int `json:"total_safe_items" console:"header:Total Safe Items"`
+ TotalEpisodes int `json:"total_episodes" console:"header:Total Episodes"`
+ HighConfidenceEpisodes int `json:"high_confidence_episodes" console:"header:High Confidence Episodes"`
}
// RunData contains information about a single workflow run
type RunData struct {
- DatabaseID int64 `json:"database_id" console:"header:Run ID"`
- Number int `json:"number" console:"-"`
- WorkflowName string `json:"workflow_name" console:"header:Workflow"`
- WorkflowPath string `json:"workflow_path" console:"-"`
- Agent string `json:"agent,omitempty" console:"header:Agent,omitempty"`
- Status string `json:"status" console:"header:Status"`
- Conclusion string `json:"conclusion,omitempty" console:"-"`
- Duration string `json:"duration,omitempty" console:"header:Duration,omitempty"`
- TokenUsage int `json:"token_usage,omitempty" console:"header:Tokens,format:number,omitempty"`
- EstimatedCost float64 `json:"estimated_cost,omitempty" console:"header:Cost ($),format:cost,omitempty"`
- Turns int `json:"turns,omitempty" console:"header:Turns,omitempty"`
- ErrorCount int `json:"error_count" console:"header:Errors"`
- WarningCount int `json:"warning_count" console:"header:Warnings"`
- MissingToolCount int `json:"missing_tool_count" console:"header:Missing Tools"`
- MissingDataCount int `json:"missing_data_count" console:"header:Missing Data"`
- SafeItemsCount int `json:"safe_items_count,omitempty" console:"header:Safe Items,omitempty"`
- CreatedAt time.Time `json:"created_at" console:"header:Created"`
- StartedAt time.Time `json:"started_at,omitzero" console:"-"`
- UpdatedAt time.Time `json:"updated_at,omitzero" console:"-"`
- URL string `json:"url" console:"-"`
- LogsPath string `json:"logs_path" console:"header:Logs Path"`
- Event string `json:"event" console:"-"`
- Branch string `json:"branch" console:"-"`
- AwContext *AwContext `json:"context,omitempty" console:"-"` // aw_context data from aw_info.json
+ DatabaseID int64 `json:"database_id" console:"header:Run ID"`
+ Number int `json:"number" console:"-"`
+ WorkflowName string `json:"workflow_name" console:"header:Workflow"`
+ WorkflowPath string `json:"workflow_path" console:"-"`
+ Agent string `json:"agent,omitempty" console:"header:Agent,omitempty"`
+ Status string `json:"status" console:"header:Status"`
+ Conclusion string `json:"conclusion,omitempty" console:"-"`
+ Duration string `json:"duration,omitempty" console:"header:Duration,omitempty"`
+ TokenUsage int `json:"token_usage,omitempty" console:"header:Tokens,format:number,omitempty"`
+ EstimatedCost float64 `json:"estimated_cost,omitempty" console:"header:Cost ($),format:cost,omitempty"`
+ Turns int `json:"turns,omitempty" console:"header:Turns,omitempty"`
+ ErrorCount int `json:"error_count" console:"header:Errors"`
+ WarningCount int `json:"warning_count" console:"header:Warnings"`
+ MissingToolCount int `json:"missing_tool_count" console:"header:Missing Tools"`
+ MissingDataCount int `json:"missing_data_count" console:"header:Missing Data"`
+ SafeItemsCount int `json:"safe_items_count,omitempty" console:"header:Safe Items,omitempty"`
+ CreatedAt time.Time `json:"created_at" console:"header:Created"`
+ StartedAt time.Time `json:"started_at,omitzero" console:"-"`
+ UpdatedAt time.Time `json:"updated_at,omitzero" console:"-"`
+ URL string `json:"url" console:"-"`
+ LogsPath string `json:"logs_path" console:"header:Logs Path"`
+ Event string `json:"event" console:"-"`
+ Branch string `json:"branch" console:"-"`
+ Comparison *AuditComparisonData `json:"comparison,omitempty" console:"-"`
+ TaskDomain *TaskDomainInfo `json:"task_domain,omitempty" console:"-"`
+ BehaviorFingerprint *BehaviorFingerprint `json:"behavior_fingerprint,omitempty" console:"-"`
+ AgenticAssessments []AgenticAssessment `json:"agentic_assessments,omitempty" console:"-"`
+ AwContext *AwContext `json:"context,omitempty" console:"-"` // aw_context data from aw_info.json
}
// ToolUsageSummary contains aggregated tool usage statistics
@@ -166,7 +175,7 @@ func buildLogsData(processedRuns []ProcessedRun, outputDir string, continuation
totalMissingData += run.MissingDataCount
totalSafeItems += run.SafeItemsCount
- // Extract agent/engine ID and aw_context from aw_info.json
+ // Extract agent/engine ID and aw_context from aw_info.json.
agentID := ""
var awContext *AwContext
awInfoPath := filepath.Join(run.LogsPath, "aw_info.json")
@@ -174,31 +183,40 @@ func buildLogsData(processedRuns []ProcessedRun, outputDir string, continuation
agentID = info.EngineID
awContext = info.Context
}
+ if awContext == nil {
+ awContext = pr.AwContext
+ }
+
+ comparison := buildAuditComparisonForProcessedRuns(pr, processedRuns)
runData := RunData{
- DatabaseID: run.DatabaseID,
- Number: run.Number,
- WorkflowName: run.WorkflowName,
- WorkflowPath: run.WorkflowPath,
- Agent: agentID,
- Status: run.Status,
- Conclusion: run.Conclusion,
- TokenUsage: run.TokenUsage,
- EstimatedCost: run.EstimatedCost,
- Turns: run.Turns,
- ErrorCount: run.ErrorCount,
- WarningCount: run.WarningCount,
- MissingToolCount: run.MissingToolCount,
- MissingDataCount: run.MissingDataCount,
- SafeItemsCount: run.SafeItemsCount,
- CreatedAt: run.CreatedAt,
- StartedAt: run.StartedAt,
- UpdatedAt: run.UpdatedAt,
- URL: run.URL,
- LogsPath: run.LogsPath,
- Event: run.Event,
- Branch: run.HeadBranch,
- AwContext: awContext,
+ DatabaseID: run.DatabaseID,
+ Number: run.Number,
+ WorkflowName: run.WorkflowName,
+ WorkflowPath: run.WorkflowPath,
+ Agent: agentID,
+ Status: run.Status,
+ Conclusion: run.Conclusion,
+ TokenUsage: run.TokenUsage,
+ EstimatedCost: run.EstimatedCost,
+ Turns: run.Turns,
+ ErrorCount: run.ErrorCount,
+ WarningCount: run.WarningCount,
+ MissingToolCount: run.MissingToolCount,
+ MissingDataCount: run.MissingDataCount,
+ SafeItemsCount: run.SafeItemsCount,
+ CreatedAt: run.CreatedAt,
+ StartedAt: run.StartedAt,
+ UpdatedAt: run.UpdatedAt,
+ URL: run.URL,
+ LogsPath: run.LogsPath,
+ Event: run.Event,
+ Branch: run.HeadBranch,
+ Comparison: comparison,
+ TaskDomain: pr.TaskDomain,
+ BehaviorFingerprint: pr.BehaviorFingerprint,
+ AgenticAssessments: pr.AgenticAssessments,
+ AwContext: awContext,
}
if run.Duration > 0 {
runData.Duration = timeutil.FormatDuration(run.Duration)
@@ -219,6 +237,14 @@ func buildLogsData(processedRuns []ProcessedRun, outputDir string, continuation
TotalSafeItems: totalSafeItems,
}
+ episodes, edges := buildEpisodeData(runs, processedRuns)
+ for _, episode := range episodes {
+ summary.TotalEpisodes++
+ if episode.Confidence == "high" {
+ summary.HighConfidenceEpisodes++
+ }
+ }
+
// Build tool usage summary
toolUsage := buildToolUsageSummary(processedRuns)
@@ -246,13 +272,18 @@ func buildLogsData(processedRuns []ProcessedRun, outputDir string, continuation
// Build redacted domains summary
redactedDomains := buildRedactedDomainsSummary(processedRuns)
+ observability := buildLogsObservabilityInsights(processedRuns, toolUsage)
+
absOutputDir, _ := filepath.Abs(outputDir)
return LogsData{
Summary: summary,
Runs: runs,
+ Episodes: episodes,
+ Edges: edges,
ToolUsage: toolUsage,
MCPToolUsage: mcpToolUsage,
+ Observability: observability,
ErrorsAndWarnings: errorsAndWarnings,
MissingTools: missingTools,
MissingData: missingData,
@@ -942,4 +973,11 @@ func renderLogsConsole(data LogsData) {
console.FormatInfoMessage("•"),
len(data.ToolUsage))
}
+
+ if len(data.Observability) > 0 {
+ fmt.Fprintln(os.Stderr)
+ fmt.Fprintln(os.Stderr, console.FormatSectionHeader("Observability Insights"))
+ fmt.Fprintln(os.Stderr)
+ renderObservabilityInsights(data.Observability)
+ }
}
diff --git a/pkg/cli/logs_summary_test.go b/pkg/cli/logs_summary_test.go
index e1f4df9fc4..4e409012ae 100644
--- a/pkg/cli/logs_summary_test.go
+++ b/pkg/cli/logs_summary_test.go
@@ -44,6 +44,24 @@ func TestSaveAndLoadRunSummary(t *testing.T) {
EstimatedCost: 0.05,
Turns: 5,
},
+ TaskDomain: &TaskDomainInfo{
+ Name: "research",
+ Label: "Research",
+ },
+ BehaviorFingerprint: &BehaviorFingerprint{
+ ExecutionStyle: "adaptive",
+ ToolBreadth: "moderate",
+ ActuationStyle: "selective_write",
+ ResourceProfile: "moderate",
+ DispatchMode: "delegated",
+ },
+ AgenticAssessments: []AgenticAssessment{
+ {
+ Kind: "delegated_context_present",
+ Severity: "info",
+ Summary: "The run preserved upstream dispatch context.",
+ },
+ },
MissingTools: []MissingToolReport{
{
Tool: "test_tool",
@@ -86,6 +104,15 @@ func TestSaveAndLoadRunSummary(t *testing.T) {
if loadedSummary.Metrics.TokenUsage != testSummary.Metrics.TokenUsage {
t.Errorf("Metrics.TokenUsage mismatch: got %d, want %d", loadedSummary.Metrics.TokenUsage, testSummary.Metrics.TokenUsage)
}
+ if loadedSummary.TaskDomain == nil || loadedSummary.TaskDomain.Name != testSummary.TaskDomain.Name {
+ t.Fatalf("TaskDomain mismatch: got %+v, want %+v", loadedSummary.TaskDomain, testSummary.TaskDomain)
+ }
+ if loadedSummary.BehaviorFingerprint == nil || loadedSummary.BehaviorFingerprint.DispatchMode != testSummary.BehaviorFingerprint.DispatchMode {
+ t.Fatalf("BehaviorFingerprint mismatch: got %+v, want %+v", loadedSummary.BehaviorFingerprint, testSummary.BehaviorFingerprint)
+ }
+ if len(loadedSummary.AgenticAssessments) != len(testSummary.AgenticAssessments) {
+ t.Fatalf("AgenticAssessments length mismatch: got %d, want %d", len(loadedSummary.AgenticAssessments), len(testSummary.AgenticAssessments))
+ }
if len(loadedSummary.MissingTools) != len(testSummary.MissingTools) {
t.Errorf("MissingTools length mismatch: got %d, want %d", len(loadedSummary.MissingTools), len(testSummary.MissingTools))
}
diff --git a/pkg/cli/observability_insights.go b/pkg/cli/observability_insights.go
new file mode 100644
index 0000000000..b2cb45cee2
--- /dev/null
+++ b/pkg/cli/observability_insights.go
@@ -0,0 +1,337 @@
+package cli
+
+import (
+ "fmt"
+ "os"
+ "strings"
+)
+
+type ObservabilityInsight struct {
+ Category string `json:"category"`
+ Severity string `json:"severity"`
+ Title string `json:"title"`
+ Summary string `json:"summary"`
+ Evidence string `json:"evidence,omitempty"`
+}
+
+type workflowObservabilityStats struct {
+ workflowName string
+ runs int
+ failures int
+ timedOuts int
+ missingTools int
+ mcpFailures int
+ missingData int
+ safeItems int
+ totalTurns int
+ minTurns int
+ maxTurns int
+ blocked int
+ totalNet int
+}
+
+func buildAuditObservabilityInsights(processedRun ProcessedRun, metrics MetricsData, toolUsage []ToolUsageInfo, createdItems []CreatedItemReport) []ObservabilityInsight {
+ insights := make([]ObservabilityInsight, 0, 5)
+ toolTypes := len(toolUsage)
+
+ switch {
+ case metrics.Turns >= 12 || toolTypes >= 6:
+ insights = append(insights, ObservabilityInsight{
+ Category: "execution",
+ Severity: "medium",
+ Title: "Exploratory execution path",
+ Summary: fmt.Sprintf("The agent used %d turns across %d tool types, which indicates adaptive planning instead of a strictly linear path.", metrics.Turns, toolTypes),
+ Evidence: fmt.Sprintf("turns=%d tool_types=%d", metrics.Turns, toolTypes),
+ })
+ case metrics.Turns >= 6 || toolTypes >= 4:
+ insights = append(insights, ObservabilityInsight{
+ Category: "execution",
+ Severity: "info",
+ Title: "Adaptive execution path",
+ Summary: fmt.Sprintf("The run stayed moderately dynamic with %d turns and %d tool types.", metrics.Turns, toolTypes),
+ Evidence: fmt.Sprintf("turns=%d tool_types=%d", metrics.Turns, toolTypes),
+ })
+ default:
+ insights = append(insights, ObservabilityInsight{
+ Category: "execution",
+ Severity: "info",
+ Title: "Directed execution path",
+ Summary: fmt.Sprintf("The run remained relatively linear with %d turns and %d tool types.", metrics.Turns, toolTypes),
+ Evidence: fmt.Sprintf("turns=%d tool_types=%d", metrics.Turns, toolTypes),
+ })
+ }
+
+ createdCount := len(createdItems)
+ safeItemsCount := processedRun.Run.SafeItemsCount
+ if createdCount > 0 || safeItemsCount > 0 {
+ insights = append(insights, ObservabilityInsight{
+ Category: "actuation",
+ Severity: "info",
+ Title: "Write path executed",
+ Summary: fmt.Sprintf("The workflow crossed from analysis into action, producing %d created item(s) and %d safe output action(s).", createdCount, safeItemsCount),
+ Evidence: fmt.Sprintf("created_items=%d safe_items=%d", createdCount, safeItemsCount),
+ })
+ } else {
+ insights = append(insights, ObservabilityInsight{
+ Category: "actuation",
+ Severity: "info",
+ Title: "Read-only posture observed",
+ Summary: "The workflow stayed in an analysis posture and did not emit any GitHub write actions.",
+ Evidence: "created_items=0 safe_items=0",
+ })
+ }
+
+ frictionEvents := len(processedRun.MissingTools) + len(processedRun.MCPFailures) + len(processedRun.MissingData)
+ if frictionEvents > 0 {
+ severity := "medium"
+ if len(processedRun.MCPFailures) > 0 || frictionEvents >= 3 {
+ severity = "high"
+ }
+ insights = append(insights, ObservabilityInsight{
+ Category: "tooling",
+ Severity: severity,
+ Title: "Capability friction detected",
+ Summary: fmt.Sprintf("The run hit %d capability gap event(s): %d missing tool(s), %d MCP failure(s), and %d missing data signal(s).", frictionEvents, len(processedRun.MissingTools), len(processedRun.MCPFailures), len(processedRun.MissingData)),
+ Evidence: fmt.Sprintf("missing_tools=%d mcp_failures=%d missing_data=%d", len(processedRun.MissingTools), len(processedRun.MCPFailures), len(processedRun.MissingData)),
+ })
+ }
+
+ if processedRun.FirewallAnalysis != nil && processedRun.FirewallAnalysis.TotalRequests > 0 {
+ blockedRate := float64(processedRun.FirewallAnalysis.BlockedRequests) / float64(processedRun.FirewallAnalysis.TotalRequests)
+ severity := "info"
+ title := "Network policy aligned"
+ summary := fmt.Sprintf("The firewall observed %d request(s) with %d blocked, for a %.0f%% block rate.", processedRun.FirewallAnalysis.TotalRequests, processedRun.FirewallAnalysis.BlockedRequests, blockedRate*100)
+ if processedRun.FirewallAnalysis.BlockedRequests > 0 {
+ title = "Network friction detected"
+ severity = "medium"
+ if blockedRate >= 0.5 || processedRun.FirewallAnalysis.BlockedRequests >= 10 {
+ severity = "high"
+ }
+ }
+ insights = append(insights, ObservabilityInsight{
+ Category: "network",
+ Severity: severity,
+ Title: title,
+ Summary: summary,
+ Evidence: fmt.Sprintf("blocked=%d total=%d", processedRun.FirewallAnalysis.BlockedRequests, processedRun.FirewallAnalysis.TotalRequests),
+ })
+ }
+
+ if processedRun.RedactedDomainsAnalysis != nil && processedRun.RedactedDomainsAnalysis.TotalDomains > 0 {
+ insights = append(insights, ObservabilityInsight{
+ Category: "privacy",
+ Severity: "info",
+ Title: "Sensitive destinations were redacted",
+ Summary: fmt.Sprintf("Observability data preserved privacy boundaries by redacting %d domain(s) from emitted logs.", processedRun.RedactedDomainsAnalysis.TotalDomains),
+ Evidence: fmt.Sprintf("redacted_domains=%d", processedRun.RedactedDomainsAnalysis.TotalDomains),
+ })
+ }
+
+ return insights
+}
+
+func buildLogsObservabilityInsights(processedRuns []ProcessedRun, toolUsage []ToolUsageSummary) []ObservabilityInsight {
+ if len(processedRuns) == 0 {
+ return nil
+ }
+
+ insights := make([]ObservabilityInsight, 0, 6)
+ workflowStats := make(map[string]*workflowObservabilityStats)
+ writeRuns := 0
+ readOnlyRuns := 0
+
+ for _, pr := range processedRuns {
+ stats, exists := workflowStats[pr.Run.WorkflowName]
+ if !exists {
+ stats = &workflowObservabilityStats{
+ workflowName: pr.Run.WorkflowName,
+ minTurns: pr.Run.Turns,
+ maxTurns: pr.Run.Turns,
+ }
+ workflowStats[pr.Run.WorkflowName] = stats
+ }
+
+ stats.runs++
+ stats.totalTurns += pr.Run.Turns
+ if stats.runs == 1 || pr.Run.Turns < stats.minTurns {
+ stats.minTurns = pr.Run.Turns
+ }
+ if pr.Run.Turns > stats.maxTurns {
+ stats.maxTurns = pr.Run.Turns
+ }
+ if pr.Run.Conclusion == "failure" {
+ stats.failures++
+ }
+ if pr.Run.Conclusion == "timed_out" {
+ stats.timedOuts++
+ }
+ stats.missingTools += len(pr.MissingTools)
+ stats.mcpFailures += len(pr.MCPFailures)
+ stats.missingData += len(pr.MissingData)
+ stats.safeItems += pr.Run.SafeItemsCount
+ if pr.Run.SafeItemsCount > 0 {
+ writeRuns++
+ } else {
+ readOnlyRuns++
+ }
+ if pr.FirewallAnalysis != nil {
+ stats.blocked += pr.FirewallAnalysis.BlockedRequests
+ stats.totalNet += pr.FirewallAnalysis.TotalRequests
+ }
+ }
+
+ var failureHotspot *workflowObservabilityStats
+ for _, stats := range workflowStats {
+ if stats.failures == 0 {
+ continue
+ }
+ if failureHotspot == nil || stats.failures > failureHotspot.failures || (stats.failures == failureHotspot.failures && stats.workflowName < failureHotspot.workflowName) {
+ failureHotspot = stats
+ }
+ }
+ if failureHotspot != nil {
+ failureRate := float64(failureHotspot.failures) / float64(failureHotspot.runs)
+ severity := "medium"
+ if failureRate >= 0.5 {
+ severity = "high"
+ }
+ insights = append(insights, ObservabilityInsight{
+ Category: "reliability",
+ Severity: severity,
+ Title: "Failure hotspot identified",
+ Summary: fmt.Sprintf("Workflow %q accounted for %d failure(s) across %d run(s), a %.0f%% failure rate.", failureHotspot.workflowName, failureHotspot.failures, failureHotspot.runs, failureRate*100),
+ Evidence: fmt.Sprintf("workflow=%s failures=%d runs=%d", failureHotspot.workflowName, failureHotspot.failures, failureHotspot.runs),
+ })
+ }
+
+ var driftHotspot *workflowObservabilityStats
+ for _, stats := range workflowStats {
+ if stats.runs < 2 {
+ continue
+ }
+ if stats.maxTurns-stats.minTurns < 4 {
+ continue
+ }
+ if driftHotspot == nil || (stats.maxTurns-stats.minTurns) > (driftHotspot.maxTurns-driftHotspot.minTurns) {
+ driftHotspot = stats
+ }
+ }
+ if driftHotspot != nil {
+ avgTurns := float64(driftHotspot.totalTurns) / float64(driftHotspot.runs)
+ insights = append(insights, ObservabilityInsight{
+ Category: "drift",
+ Severity: "medium",
+ Title: "Execution drift observed",
+ Summary: fmt.Sprintf("Workflow %q varied from %d to %d turns across runs, which suggests changing task shape or unstable prompts (avg %.1f turns).", driftHotspot.workflowName, driftHotspot.minTurns, driftHotspot.maxTurns, avgTurns),
+ Evidence: fmt.Sprintf("workflow=%s min_turns=%d max_turns=%d", driftHotspot.workflowName, driftHotspot.minTurns, driftHotspot.maxTurns),
+ })
+ }
+
+ var toolingHotspot *workflowObservabilityStats
+ for _, stats := range workflowStats {
+ friction := stats.missingTools + stats.mcpFailures + stats.missingData
+ if friction == 0 {
+ continue
+ }
+ if toolingHotspot == nil || friction > (toolingHotspot.missingTools+toolingHotspot.mcpFailures+toolingHotspot.missingData) {
+ toolingHotspot = stats
+ }
+ }
+ if toolingHotspot != nil {
+ friction := toolingHotspot.missingTools + toolingHotspot.mcpFailures + toolingHotspot.missingData
+ severity := "medium"
+ if toolingHotspot.mcpFailures > 0 || friction >= 4 {
+ severity = "high"
+ }
+ insights = append(insights, ObservabilityInsight{
+ Category: "tooling",
+ Severity: severity,
+ Title: "Capability hotspot identified",
+ Summary: fmt.Sprintf("Workflow %q produced the most capability friction: %d missing tool(s), %d MCP failure(s), and %d missing data signal(s).", toolingHotspot.workflowName, toolingHotspot.missingTools, toolingHotspot.mcpFailures, toolingHotspot.missingData),
+ Evidence: fmt.Sprintf("workflow=%s missing_tools=%d mcp_failures=%d missing_data=%d", toolingHotspot.workflowName, toolingHotspot.missingTools, toolingHotspot.mcpFailures, toolingHotspot.missingData),
+ })
+ }
+
+ var networkHotspot *workflowObservabilityStats
+ var networkRate float64
+ for _, stats := range workflowStats {
+ if stats.totalNet == 0 || stats.blocked == 0 {
+ continue
+ }
+ rate := float64(stats.blocked) / float64(stats.totalNet)
+ if networkHotspot == nil || rate > networkRate {
+ networkHotspot = stats
+ networkRate = rate
+ }
+ }
+ if networkHotspot != nil {
+ severity := "medium"
+ if networkRate >= 0.5 || networkHotspot.blocked >= 10 {
+ severity = "high"
+ }
+ insights = append(insights, ObservabilityInsight{
+ Category: "network",
+ Severity: severity,
+ Title: "Network friction hotspot identified",
+ Summary: fmt.Sprintf("Workflow %q had the highest firewall block pressure with %d blocked request(s) out of %d total (%.0f%%).", networkHotspot.workflowName, networkHotspot.blocked, networkHotspot.totalNet, networkRate*100),
+ Evidence: fmt.Sprintf("workflow=%s blocked=%d total=%d", networkHotspot.workflowName, networkHotspot.blocked, networkHotspot.totalNet),
+ })
+ }
+
+ if writeRuns > 0 || readOnlyRuns > 0 {
+ insights = append(insights, ObservabilityInsight{
+ Category: "actuation",
+ Severity: "info",
+ Title: "Actuation mix summarized",
+ Summary: fmt.Sprintf("Across %d run(s), %d executed write-capable safe outputs and %d stayed read-only.", len(processedRuns), writeRuns, readOnlyRuns),
+ Evidence: fmt.Sprintf("write_runs=%d read_only_runs=%d", writeRuns, readOnlyRuns),
+ })
+ }
+
+ totalToolCalls := 0
+ for _, tool := range toolUsage {
+ totalToolCalls += tool.TotalCalls
+ }
+ if len(toolUsage) > 0 && totalToolCalls > 0 {
+ topTool := toolUsage[0]
+ share := float64(topTool.TotalCalls) / float64(totalToolCalls)
+ if share >= 0.5 {
+ severity := "info"
+ if share >= 0.7 {
+ severity = "medium"
+ }
+ insights = append(insights, ObservabilityInsight{
+ Category: "tooling",
+ Severity: severity,
+ Title: "Tool concentration observed",
+ Summary: fmt.Sprintf("Tool %q accounted for %.0f%% of observed tool calls, which suggests the workflow fleet depends heavily on a narrow capability path.", topTool.Name, share*100),
+ Evidence: fmt.Sprintf("tool=%s calls=%d total_calls=%d", topTool.Name, topTool.TotalCalls, totalToolCalls),
+ })
+ }
+ }
+
+ return insights
+}
+
+func renderObservabilityInsights(insights []ObservabilityInsight) {
+ for _, insight := range insights {
+ icon := "[info]"
+ switch insight.Severity {
+ case "critical":
+ icon = "[critical]"
+ case "high":
+ icon = "[high]"
+ case "medium":
+ icon = "[medium]"
+ case "low":
+ icon = "[low]"
+ }
+
+ fmt.Fprintf(os.Stderr, " %s %s [%s]\n", icon, insight.Title, insight.Category)
+ fmt.Fprintf(os.Stderr, " %s\n", insight.Summary)
+ if strings.TrimSpace(insight.Evidence) != "" {
+ fmt.Fprintf(os.Stderr, " Evidence: %s\n", insight.Evidence)
+ }
+ fmt.Fprintln(os.Stderr)
+ }
+}
diff --git a/pkg/cli/observability_insights_test.go b/pkg/cli/observability_insights_test.go
new file mode 100644
index 0000000000..548459607d
--- /dev/null
+++ b/pkg/cli/observability_insights_test.go
@@ -0,0 +1,121 @@
+//go:build !integration
+
+package cli
+
+import (
+ "strings"
+ "testing"
+ "time"
+
+ "github.com/github/gh-aw/pkg/workflow"
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+)
+
+func TestBuildAuditObservabilityInsights(t *testing.T) {
+ processedRun := ProcessedRun{
+ Run: WorkflowRun{
+ Turns: 11,
+ SafeItemsCount: 2,
+ },
+ MissingTools: []MissingToolReport{{Tool: "terraform"}},
+ MCPFailures: []MCPFailureReport{{ServerName: "github"}},
+ MissingData: []MissingDataReport{{DataType: "issue_body"}},
+ FirewallAnalysis: &FirewallAnalysis{
+ TotalRequests: 20,
+ BlockedRequests: 8,
+ AllowedRequests: 12,
+ },
+ RedactedDomainsAnalysis: &RedactedDomainsAnalysis{TotalDomains: 3},
+ }
+
+ metrics := MetricsData{Turns: 11}
+ toolUsage := []ToolUsageInfo{
+ {Name: "bash", CallCount: 4},
+ {Name: "github_issue_read", CallCount: 2},
+ {Name: "grep", CallCount: 1},
+ {Name: "sed", CallCount: 1},
+ }
+ createdItems := []CreatedItemReport{{Type: "create_issue"}}
+
+ insights := buildAuditObservabilityInsights(processedRun, metrics, toolUsage, createdItems)
+ require.Len(t, insights, 5, "expected five audit insights from the supplied signals")
+
+ titles := make([]string, 0, len(insights))
+ for _, insight := range insights {
+ titles = append(titles, insight.Title)
+ }
+
+ assert.Contains(t, titles, "Adaptive execution path")
+ assert.Contains(t, titles, "Write path executed")
+ assert.Contains(t, titles, "Capability friction detected")
+ assert.Contains(t, titles, "Network friction detected")
+ assert.Contains(t, titles, "Sensitive destinations were redacted")
+}
+
+func TestBuildLogsObservabilityInsights(t *testing.T) {
+ processedRuns := []ProcessedRun{
+ {
+ Run: WorkflowRun{WorkflowName: "triage", Conclusion: "failure", Turns: 3, SafeItemsCount: 0},
+ MissingTools: []MissingToolReport{{Tool: "terraform"}},
+ FirewallAnalysis: &FirewallAnalysis{TotalRequests: 10, BlockedRequests: 1},
+ },
+ {
+ Run: WorkflowRun{WorkflowName: "triage", Conclusion: "failure", Turns: 9, SafeItemsCount: 1},
+ MCPFailures: []MCPFailureReport{{ServerName: "github"}},
+ FirewallAnalysis: &FirewallAnalysis{TotalRequests: 10, BlockedRequests: 7},
+ },
+ {
+ Run: WorkflowRun{WorkflowName: "docs", Conclusion: "success", Turns: 2, SafeItemsCount: 1},
+ },
+ }
+
+ toolUsage := []ToolUsageSummary{
+ {Name: "bash", TotalCalls: 14},
+ {Name: "github_issue_read", TotalCalls: 6},
+ }
+
+ insights := buildLogsObservabilityInsights(processedRuns, toolUsage)
+ require.NotEmpty(t, insights, "expected aggregated logs insights")
+
+ var combined []string
+ for _, insight := range insights {
+ combined = append(combined, insight.Title+" "+insight.Summary)
+ }
+ text := strings.Join(combined, "\n")
+
+ assert.Contains(t, text, "Failure hotspot identified")
+ assert.Contains(t, text, "Execution drift observed")
+ assert.Contains(t, text, "Capability hotspot identified")
+ assert.Contains(t, text, "Network friction hotspot identified")
+ assert.Contains(t, text, "Actuation mix summarized")
+ assert.Contains(t, text, "Tool concentration observed")
+}
+
+func TestBuildAuditDataIncludesObservabilityInsights(t *testing.T) {
+ processedRun := ProcessedRun{
+ Run: WorkflowRun{
+ DatabaseID: 42,
+ WorkflowName: "insight-test",
+ Status: "completed",
+ Conclusion: "success",
+ Duration: 2 * time.Minute,
+ Turns: 7,
+ SafeItemsCount: 1,
+ },
+ }
+
+ metrics := workflow.LogMetrics{
+ Turns: 7,
+ ToolCalls: []workflow.ToolCallInfo{
+ {Name: "bash", CallCount: 3},
+ {Name: "github_issue_read", CallCount: 2},
+ {Name: "grep", CallCount: 1},
+ {Name: "sed", CallCount: 1},
+ },
+ }
+
+ auditData := buildAuditData(processedRun, metrics, nil)
+ require.NotEmpty(t, auditData.ObservabilityInsights, "audit data should expose observability insights")
+ assert.Equal(t, "execution", auditData.ObservabilityInsights[0].Category)
+}
diff --git a/pkg/parser/schema_test.go b/pkg/parser/schema_test.go
index 515df3720a..0fe8adf86e 100644
--- a/pkg/parser/schema_test.go
+++ b/pkg/parser/schema_test.go
@@ -194,3 +194,25 @@ func TestGetSafeOutputTypeKeys(t *testing.T) {
}
}
}
+
+func TestValidateMainWorkflowFrontmatterWithSchema_AllowsObservabilityJobSummary(t *testing.T) {
+ frontmatter := map[string]any{
+ "on": "push",
+ "observability": map[string]any{
+ "job-summary": "on",
+ },
+ }
+
+ tempFile := "/tmp/gh-aw/test_observability_frontmatter.md"
+ if err := os.MkdirAll("/tmp/gh-aw", 0755); err != nil {
+ t.Fatalf("Failed to create temp directory: %v", err)
+ }
+ if err := os.WriteFile(tempFile, []byte("---\non: push\nobservability:\n job-summary: on\n---\n"), 0644); err != nil {
+ t.Fatalf("Failed to create temp file: %v", err)
+ }
+ defer os.Remove(tempFile)
+
+ if err := ValidateMainWorkflowFrontmatterWithSchemaAndLocation(frontmatter, tempFile); err != nil {
+ t.Fatalf("Expected observability config to validate, got: %v", err)
+ }
+}
diff --git a/pkg/parser/schemas/main_workflow_schema.json b/pkg/parser/schemas/main_workflow_schema.json
index 9428fb27ae..22ab3f984b 100644
--- a/pkg/parser/schemas/main_workflow_schema.json
+++ b/pkg/parser/schemas/main_workflow_schema.json
@@ -8235,6 +8235,18 @@
},
"additionalProperties": false
},
+ "observability": {
+ "type": "object",
+ "description": "Optional observability output settings for workflow runs.",
+ "properties": {
+ "job-summary": {
+ "type": "string",
+ "enum": ["on", "off"],
+ "description": "If set to 'on', append a compact observability section to the GitHub Actions job summary. Defaults to off when omitted."
+ }
+ },
+ "additionalProperties": false
+ },
"bots": {
"type": "array",
"description": "Allow list of bot identifiers that can trigger the workflow even if they don't meet the required role permissions. When the actor is in this list, the bot must be active (installed) on the repository to trigger the workflow.",
diff --git a/pkg/workflow/compiler_yaml_ai_execution.go b/pkg/workflow/compiler_yaml_ai_execution.go
index ae8ea049b3..320e25feb0 100644
--- a/pkg/workflow/compiler_yaml_ai_execution.go
+++ b/pkg/workflow/compiler_yaml_ai_execution.go
@@ -5,6 +5,31 @@ import (
"strings"
)
+func getObservabilityJobSummaryMode(data *WorkflowData) string {
+ if data == nil {
+ return ""
+ }
+
+ mode := ""
+ if data.ParsedFrontmatter != nil && data.ParsedFrontmatter.Observability != nil {
+ mode = data.ParsedFrontmatter.Observability.JobSummary
+ }
+
+ if mode == "" && data.RawFrontmatter != nil {
+ if rawObservability, ok := data.RawFrontmatter["observability"].(map[string]any); ok {
+ if rawMode, ok := rawObservability["job-summary"].(string); ok {
+ mode = rawMode
+ }
+ }
+ }
+
+ if mode == "off" {
+ return ""
+ }
+
+ return mode
+}
+
// generateEngineExecutionSteps generates the GitHub Actions steps for executing the AI engine
func (c *Compiler) generateEngineExecutionSteps(yaml *strings.Builder, data *WorkflowData, engine CodingAgentEngine, logFile string) {
@@ -91,6 +116,29 @@ func (c *Compiler) generateMCPGatewayLogParsing(yaml *strings.Builder) {
yaml.WriteString(" await main();\n")
}
+// generateObservabilitySummary generates an opt-in step that synthesizes a compact
+// observability section for the GitHub Actions step summary from existing runtime files.
+func (c *Compiler) generateObservabilitySummary(yaml *strings.Builder, data *WorkflowData) {
+ mode := getObservabilityJobSummaryMode(data)
+ if mode == "" {
+ return
+ }
+
+ compilerYamlLog.Printf("Generating observability step summary: mode=%s", mode)
+
+ yaml.WriteString(" - name: Generate observability summary\n")
+ yaml.WriteString(" if: always()\n")
+ fmt.Fprintf(yaml, " uses: %s\n", GetActionPin("actions/github-script"))
+ yaml.WriteString(" env:\n")
+ fmt.Fprintf(yaml, " GH_AW_OBSERVABILITY_JOB_SUMMARY: %q\n", mode)
+ yaml.WriteString(" with:\n")
+ yaml.WriteString(" script: |\n")
+ yaml.WriteString(" const { setupGlobals } = require('" + SetupActionDestination + "/setup_globals.cjs');\n")
+ yaml.WriteString(" setupGlobals(core, github, context, exec, io);\n")
+ yaml.WriteString(" const { main } = require('${{ runner.temp }}/gh-aw/actions/generate_observability_summary.cjs');\n")
+ yaml.WriteString(" await main(core);\n")
+}
+
// generateStopMCPGateway generates a step that stops the MCP gateway process using its PID from step output
// It passes the gateway port and API key to enable graceful shutdown via /close endpoint
func (c *Compiler) generateStopMCPGateway(yaml *strings.Builder, data *WorkflowData) {
diff --git a/pkg/workflow/compiler_yaml_main_job.go b/pkg/workflow/compiler_yaml_main_job.go
index 5148c4522a..64e4284062 100644
--- a/pkg/workflow/compiler_yaml_main_job.go
+++ b/pkg/workflow/compiler_yaml_main_job.go
@@ -468,6 +468,9 @@ func (c *Compiler) generateMainJobSteps(yaml *strings.Builder, data *WorkflowDat
}
}
+ // Optionally synthesize a compact observability section from runtime artifacts.
+ c.generateObservabilitySummary(yaml, data)
+
// Collect agent stdio logs path for unified upload
artifactPaths = append(artifactPaths, logFileFull)
diff --git a/pkg/workflow/frontmatter_types.go b/pkg/workflow/frontmatter_types.go
index f3fe8ca3af..bc1daaebf4 100644
--- a/pkg/workflow/frontmatter_types.go
+++ b/pkg/workflow/frontmatter_types.go
@@ -129,6 +129,11 @@ type RateLimitConfig struct {
IgnoredRoles []string `json:"ignored-roles,omitempty"` // Roles that are exempt from rate limiting (e.g., ["admin", "maintainer"])
}
+// ObservabilityConfig represents workflow observability options.
+type ObservabilityConfig struct {
+ JobSummary string `json:"job-summary,omitempty"`
+}
+
// FrontmatterConfig represents the structured configuration from workflow frontmatter
// This provides compile-time type safety and clearer error messages compared to map[string]any
type FrontmatterConfig struct {
@@ -188,6 +193,7 @@ type FrontmatterConfig struct {
// Metadata
Metadata map[string]string `json:"metadata,omitempty"` // Custom metadata key-value pairs
SecretMasking *SecretMaskingConfig `json:"secret-masking,omitempty"`
+ Observability *ObservabilityConfig `json:"observability,omitempty"`
// Rate limiting configuration
RateLimit *RateLimitConfig `json:"rate-limit,omitempty"`
diff --git a/pkg/workflow/frontmatter_types_test.go b/pkg/workflow/frontmatter_types_test.go
index afffbedada..3c47a39a47 100644
--- a/pkg/workflow/frontmatter_types_test.go
+++ b/pkg/workflow/frontmatter_types_test.go
@@ -191,6 +191,27 @@ func TestParseFrontmatterConfig(t *testing.T) {
}
})
+ t.Run("handles observability configuration", func(t *testing.T) {
+ frontmatter := map[string]any{
+ "observability": map[string]any{
+ "job-summary": "on",
+ },
+ }
+
+ config, err := ParseFrontmatterConfig(frontmatter)
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+
+ if config.Observability == nil {
+ t.Fatal("Observability should not be nil")
+ }
+
+ if config.Observability.JobSummary != "on" {
+ t.Errorf("JobSummary = %q, want %q", config.Observability.JobSummary, "on")
+ }
+ })
+
t.Run("handles jobs configuration", func(t *testing.T) {
frontmatter := map[string]any{
"jobs": map[string]any{
diff --git a/pkg/workflow/observability_job_summary_test.go b/pkg/workflow/observability_job_summary_test.go
new file mode 100644
index 0000000000..fd2765e5e8
--- /dev/null
+++ b/pkg/workflow/observability_job_summary_test.go
@@ -0,0 +1,85 @@
+//go:build !integration
+
+package workflow
+
+import (
+ "os"
+ "path/filepath"
+ "strings"
+ "testing"
+)
+
+func TestCompileWorkflow_IncludesObservabilitySummaryStepWhenOptedIn(t *testing.T) {
+ tmpDir := t.TempDir()
+ workflowPath := filepath.Join(tmpDir, "observability-summary.md")
+ content := `---
+on: push
+permissions:
+ contents: read
+observability:
+ job-summary: "on"
+engine: copilot
+---
+
+# Test Observability Summary
+`
+
+ if err := os.WriteFile(workflowPath, []byte(content), 0o644); err != nil {
+ t.Fatalf("Failed to write workflow: %v", err)
+ }
+
+ compiler := NewCompiler()
+ if err := compiler.CompileWorkflow(workflowPath); err != nil {
+ t.Fatalf("Unexpected compile error: %v", err)
+ }
+
+ lockPath := filepath.Join(tmpDir, "observability-summary.lock.yml")
+ lockContent, err := os.ReadFile(lockPath)
+ if err != nil {
+ t.Fatalf("Failed to read lock file: %v", err)
+ }
+
+ compiled := string(lockContent)
+ if !strings.Contains(compiled, "- name: Generate observability summary") {
+ t.Fatal("Expected observability summary step to be generated")
+ }
+ if !strings.Contains(compiled, "GH_AW_OBSERVABILITY_JOB_SUMMARY: \"on\"") {
+ t.Fatal("Expected observability summary mode env var to be set")
+ }
+ if !strings.Contains(compiled, "require('${{ runner.temp }}/gh-aw/actions/generate_observability_summary.cjs')") {
+ t.Fatal("Expected generated workflow to load generate_observability_summary.cjs")
+ }
+}
+
+func TestCompileWorkflow_DoesNotIncludeObservabilitySummaryStepByDefault(t *testing.T) {
+ tmpDir := t.TempDir()
+ workflowPath := filepath.Join(tmpDir, "no-observability-summary.md")
+ content := `---
+on: push
+permissions:
+ contents: read
+engine: copilot
+---
+
+# Test No Observability Summary
+`
+
+ if err := os.WriteFile(workflowPath, []byte(content), 0o644); err != nil {
+ t.Fatalf("Failed to write workflow: %v", err)
+ }
+
+ compiler := NewCompiler()
+ if err := compiler.CompileWorkflow(workflowPath); err != nil {
+ t.Fatalf("Unexpected compile error: %v", err)
+ }
+
+ lockPath := filepath.Join(tmpDir, "no-observability-summary.lock.yml")
+ lockContent, err := os.ReadFile(lockPath)
+ if err != nil {
+ t.Fatalf("Failed to read lock file: %v", err)
+ }
+
+ if strings.Contains(string(lockContent), "- name: Generate observability summary") {
+ t.Fatal("Did not expect observability summary step when feature is not configured")
+ }
+}