
Commit 8a7cf41

Implement GitHub action

Implements a GitHub action. Refer to gh_action/README.md for its usage. Ref packagecontrol/thecrawl#66. Ref packagecontrol/thecrawl#166.

1 parent 6c09806 commit 8a7cf41

7 files changed: +550 -0 lines

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -17,3 +17,4 @@ dist/
 *.sublime-workspace
 st_package_reviewer/_version.py
 uv.lock
+.thecrawl/

README.md

Lines changed: 5 additions & 0 deletions
@@ -16,6 +16,11 @@ reported by the tool,
 [refer to the wiki][wiki].
 
 
+## Usage as a GitHub Action
+
+See gh_action/README.md for how to run this as a composite action on channel/registry PRs.
+
+
 ## Installation
 
 Requires **Python 3.13**.

gh_action/README.md

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@

# PR Channel Action

This composite action diffs a Package Control channel registry between a PR’s base and head commits, crawls only the changed and added packages using a local or cloned copy of thecrawl, downloads each release archive, and runs `st_package_reviewer` on the extracted contents. The job fails if any crawl, download, unzip, or review step fails.
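
Because the entry point is an ordinary shell script, it can also be invoked directly for local testing, outside of Actions. A minimal sketch, assuming `gh` is authenticated and the command is run from a checkout of this repository:

```bash
# Review a channel PR locally (requires gh and uv on PATH)
./gh_action/action.sh \
  --pr https://github.com/wbond/package_control_channel/pull/9236 \
  --file repository.json
# Optionally point at a local clone of thecrawl instead of the default URL:
#   --thecrawl ../thecrawl
```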

## Inputs

- `pr` (required): Full PR URL, e.g. `https://github.com/wbond/package_control_channel/pull/9236`.
- `file` (optional): Path to the channel or repository file inside the repo. Default: `repository.json`.
- `thecrawl` (optional): Path to a local `thecrawl` repo, or a git URL to clone a fork/branch/commit. Default: `https://github.com/packagecontrol/thecrawl`.

You can pin a ref with `@ref` for HTTPS URLs, e.g.:

- `https://github.com/packagecontrol/thecrawl.git@feature-branch`
- `https://github.com/packagecontrol/thecrawl.git@v1.2.3`
- `https://github.com/packagecontrol/thecrawl.git@abc1234`

## Example Usage

```yaml
name: Channel Diff and Review
on:
  pull_request:
    paths:
      - 'repository.json'

jobs:
  diff-and-review:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Diff and review changed/added packages
        uses: ./gh_action
        with:
          pr: ${{ github.event.pull_request.html_url }}
          file: repository.json
          # thecrawl: ../thecrawl  # optional path
          # thecrawl: https://github.com/packagecontrol/thecrawl@my-branch  # optional URL with ref
```

## Notes

- The action ensures `uv` is available via `astral-sh/setup-uv`. GitHub’s hosted runners include `gh` (GitHub CLI) by default.
- If `thecrawl` is not provided, the action clones `https://github.com/packagecontrol/thecrawl`.
- Network access is required to fetch raw files, zipballs, and the GitHub API. For GitHub zipball downloads, the action falls back to `gh api` if `curl` fails.

## What It Does

- Resolves base/head repos and SHAs via `gh pr view`.
- Builds a registry JSON at both SHAs using your local or cloned `thecrawl` (`uv run -m scripts.generate_registry`).
- Diffs registries by package name; prints Removed/Changed/Added to stderr and emits changed+added names to stdout (sketched below).
- For each changed/added package:
  - Runs `uv run -m scripts.crawl --registry <target-registry> --workspace <ws.json> --name <pkg>`.
  - Reads the workspace JSON and downloads each release zip.
  - Unpacks the zip and runs `uv run st_package_reviewer <extracted_dir>`.
  - Aggregates failures and fails the job if any occurred.
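
The diff step is implemented by `gh_action/diff_repository.py`, which is part of this commit but not rendered above. A minimal sketch of the contract `action.sh` relies on — a Removed/Changed/Added summary on stderr, changed and added names on stdout — might look like the following; the top-level `"packages"` key is an assumption about the generated registry's shape:

```python
#!/usr/bin/env python3
"""Hypothetical sketch of diff_repository.py; not the committed implementation."""
import argparse
import json
import sys

parser = argparse.ArgumentParser()
parser.add_argument("--base-file", required=True)
parser.add_argument("--target-file", required=True)
parser.add_argument("--print-changed-added", action="store_true")
args = parser.parse_args()

def load_packages(path):
    # Assumption: the registry JSON maps package names to metadata
    # under a top-level "packages" key.
    with open(path) as f:
        return json.load(f)["packages"]

base = load_packages(args.base_file)
target = load_packages(args.target_file)

removed = sorted(set(base) - set(target))
added = sorted(set(target) - set(base))
changed = sorted(n for n in set(base) & set(target) if base[n] != target[n])

# Human-readable summary on stderr...
for label, names in (("Removed", removed), ("Changed", changed), ("Added", added)):
    print(f"{label}: {', '.join(names) or '<none>'}", file=sys.stderr)

# ...and machine-readable names on stdout for action.sh to consume.
if args.print_changed_added:
    for name in changed + added:
        print(name)
```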

gh_action/action.sh

Lines changed: 276 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,276 @@
1+
#!/usr/bin/env bash
2+
set -euo pipefail
3+
4+
usage() {
5+
cat >&2 <<EOF
6+
Usage: $0 --pr <pr_url> [--file <path>] [--thecrawl <path-or-url[@ref]>]
7+
8+
Arguments:
9+
--pr GitHub Pull Request URL (e.g. https://github.com/wbond/package_control_channel/pull/9236)
10+
--file Path within the repo to the channel JSON (default: repository.json)
11+
--thecrawl Path to local thecrawl repo or URL to clone (supports @ref to pin, default: https://github.com/packagecontrol/thecrawl)
12+
13+
Requires: gh, uv
14+
EOF
15+
}
16+
17+
PR_URL=""
18+
REL_PATH="repository.json"
19+
THECRAWL="https://github.com/packagecontrol/thecrawl"
20+
21+
while [[ $# -gt 0 ]]; do
22+
case "$1" in
23+
--pr)
24+
PR_URL="$2"; shift 2;;
25+
--file)
26+
REL_PATH="$2"; shift 2;;
27+
--thecrawl)
28+
THECRAWL="$2"; shift 2;;
29+
-h|--help)
30+
usage; exit 0;;
31+
*)
32+
echo "Unknown argument: $1" >&2; usage; exit 2;;
33+
esac
34+
done
35+
36+
if [[ -z "$PR_URL" ]]; then
37+
echo "Error: --pr is required" >&2; usage; exit 2
38+
fi
39+
40+
if ! command -v gh >/dev/null 2>&1; then
41+
echo "Error: gh (GitHub CLI) is required" >&2; exit 2
42+
fi
43+
if ! command -v uv >/dev/null 2>&1; then
44+
echo "Error: uv is required" >&2; exit 2
45+
fi
46+
47+
# Robust ZIP downloader with fallback to gh for GitHub zipball URLs
48+
download_zip() {
49+
local url="$1" dest="$2"
50+
mkdir -p "$(dirname "$dest")"
51+
rm -f "$dest.part" "$dest"
52+
# First try curl with retries
53+
if curl -fSL --retry 3 --retry-all-errors --connect-timeout 15 --max-time 600 \
54+
-o "$dest.part" "$url"; then
55+
mv "$dest.part" "$dest"
56+
return 0
57+
fi
58+
rm -f "$dest.part"
59+
# Fallback for codeload.github.com/<owner>/<repo>/zip/<ref>
60+
if [[ "$url" =~ ^https://codeload\.github\.com/([^/]+)/([^/]+)/zip/(.+)$ ]]; then
61+
local owner="${BASH_REMATCH[1]}" repo="${BASH_REMATCH[2]}" ref="${BASH_REMATCH[3]}"
62+
echo " curl failed; using gh api zipball for $owner/$repo@$ref" >&2
63+
if gh api -H "Accept: application/octet-stream" \
64+
"repos/${owner}/${repo}/zipball/${ref}" > "$dest.part"; then
65+
mv "$dest.part" "$dest"
66+
return 0
67+
fi
68+
rm -f "$dest.part"
69+
fi
70+
return 1
71+
}
72+
73+
# Normalize relative path (strip leading ./)
74+
REL_PATH="${REL_PATH#./}"
75+
76+
echo "Resolving PR metadata via gh: $PR_URL" >&2
77+
78+
# Derive base repo from PR URL (owner/repo)
79+
BASE_NWO=$(echo "$PR_URL" | awk -F/ '{print $4"/"$5}')
80+
# Head repo from PR data (may be same as base)
81+
HEAD_NWO=$(gh pr view "$PR_URL" --json headRepository -q '.headRepository.nameWithOwner')
82+
BASE_SHA=$(gh pr view "$PR_URL" --json baseRefOid -q .baseRefOid)
83+
HEAD_SHA=$(gh pr view "$PR_URL" --json headRefOid -q .headRefOid)
84+
85+
if [[ -z "$BASE_NWO" || -z "$BASE_SHA" || -z "$HEAD_SHA" ]]; then
86+
echo "Error: failed to resolve PR details via gh" >&2
87+
echo " PR: $PR_URL" >&2
88+
echo " base nwo: ${BASE_NWO:-<empty>}" >&2
89+
echo " base sha: ${BASE_SHA:-<empty>}" >&2
90+
echo " head nwo: ${HEAD_NWO:-<empty>} (may match base)" >&2
91+
echo " head sha: ${HEAD_SHA:-<empty>}" >&2
92+
echo "Hint:" >&2
93+
echo " - Commands used: 'gh pr view <url> --json baseRefOid,headRefOid,headRepository'" >&2
94+
exit 2
95+
fi
96+
97+
# Fallback: if HEAD_NWO is empty, assume same as base (same-repo PR)
98+
if [[ -z "$HEAD_NWO" ]]; then
99+
HEAD_NWO="$BASE_NWO"
100+
fi
101+
102+
BASE_URL="https://raw.githubusercontent.com/${BASE_NWO}/${BASE_SHA}/${REL_PATH}"
103+
HEAD_URL="https://raw.githubusercontent.com/${HEAD_NWO}/${HEAD_SHA}/${REL_PATH}"
104+
105+
echo "Base URL: $BASE_URL" >&2
106+
echo "Target URL: $HEAD_URL" >&2
107+
108+
# Locate or clone thecrawl
109+
resolve_crawler_path() {
110+
if [[ -n "$THECRAWL" ]]; then
111+
if [[ "$THECRAWL" =~ ^https?:// || "$THECRAWL" =~ ^git@ ]]; then
112+
local repo_path="${GITHUB_WORKSPACE:-$PWD}/.thecrawl"
113+
# For HTTPS URLs, allow trailing @ref
114+
local url_base="$THECRAWL"
115+
local ref=""
116+
if [[ "$url_base" =~ ^https?://.+@.+$ ]]; then
117+
ref="${url_base##*@}"
118+
url_base="${url_base%*@$ref}"
119+
fi
120+
121+
if [[ -d "$repo_path/.git" ]]; then
122+
# Existing clone: update remote and optionally checkout ref
123+
git -C "$repo_path" remote set-url origin "$url_base" >/dev/null 2>&1 || true
124+
if [[ -n "$ref" ]]; then
125+
echo "Checking out thecrawl ref '$ref' in $repo_path" >&2
126+
git -C "$repo_path" fetch --depth 1 origin "$ref" >&2
127+
git -C "$repo_path" checkout -q FETCH_HEAD >&2
128+
fi
129+
echo "$repo_path"; return
130+
fi
131+
132+
if [[ -n "$ref" ]]; then
133+
echo "Cloning thecrawl $url_base at ref '$ref' into $repo_path" >&2
134+
git init -q "$repo_path" >&2
135+
git -C "$repo_path" remote add origin "$url_base" >&2
136+
git -C "$repo_path" fetch --depth 1 origin "$ref" >&2
137+
git -C "$repo_path" checkout -q FETCH_HEAD >&2
138+
else
139+
echo "Cloning thecrawl from $url_base into $repo_path" >&2
140+
git clone --depth 1 "$url_base" "$repo_path" >&2
141+
fi
142+
echo "$repo_path"; return
143+
fi
144+
echo "$THECRAWL"; return
145+
fi
146+
echo "Error: could not resolve thecrawl path" >&2
147+
return 2
148+
}
149+
150+
CRAWLER_REPO=$(resolve_crawler_path)
151+
if [[ ! -d "$CRAWLER_REPO" ]]; then
152+
echo "Error: could not find or clone thecrawl" >&2
153+
exit 2
154+
fi
155+
156+
echo "Using thecrawl at: $CRAWLER_REPO" >&2
157+
158+
TMPDIR=$(mktemp -d)
159+
trap 'rm -rf "$TMPDIR"' EXIT
160+
161+
BASE_REG="$TMPDIR/base_registry.json"
162+
HEAD_REG="$TMPDIR/head_registry.json"
163+
164+
echo "Generating base registry…" >&2
165+
(cd "$CRAWLER_REPO" && uv run -m scripts.generate_registry -c "$BASE_URL" -o "$BASE_REG")
166+
167+
echo "Generating target registry…" >&2
168+
(cd "$CRAWLER_REPO" && uv run -m scripts.generate_registry -c "$HEAD_URL" -o "$HEAD_REG")
169+
170+
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
171+
# Invoke Python diff to print results and collect changed+added package names
172+
mapfile -t PKGS < <(python3 "$SCRIPT_DIR/diff_repository.py" --base-file "$BASE_REG" --target-file "$HEAD_REG" --print-changed-added \
173+
| tr -d '\r' \
174+
| sed '/^$/d')
175+
176+
if [[ ${#PKGS[@]} -eq 0 ]]; then
177+
echo "No changed or added packages to crawl." >&2
178+
exit 0
179+
fi
180+
181+
echo "Crawling ${#PKGS[@]} package(s) from target registry…" >&2
182+
failures=0
183+
for pkg in "${PKGS[@]}"; do
184+
[[ -z "$pkg" ]] && continue
185+
echo "- Crawling: $pkg" >&2
186+
# Use workspace file output for robust parsing
187+
wsdir="$TMPDIR/workspaces"
188+
mkdir -p "$wsdir"
189+
wsfile="$wsdir/${pkg}.json"
190+
set +e
191+
(cd "$CRAWLER_REPO" && uv run -m scripts.crawl --registry "$HEAD_REG" --workspace "$wsfile" --name "$pkg" 2> >(cat >&2))
192+
STATUS=$?
193+
set -e
194+
if [[ $STATUS -ne 0 || ! -s "$wsfile" ]]; then
195+
echo " ! Crawl failed for $pkg" >&2
196+
failures=$((failures+1))
197+
continue
198+
fi
199+
200+
# Extract release URLs (and versions) from workspace
201+
mapfile -t RELS < <(python3 "$SCRIPT_DIR/parse_workspace.py" "$wsfile" "$pkg")
202+
if [[ ${#RELS[@]} -eq 0 ]]; then
203+
echo " ! No releases found for $pkg" >&2
204+
failures=$((failures+1))
205+
continue
206+
fi
207+
208+
i=0
209+
for rec in "${RELS[@]}"; do
210+
url="${rec%%$'\t'*}"
211+
ver="${rec#*$'\t'}"
212+
# if no tab present, ver==url; fix that
213+
if [[ "$ver" == "$url" ]]; then ver=""; fi
214+
215+
i=$((i+1))
216+
disp_ver="$ver"
217+
[[ -z "$disp_ver" ]] && disp_ver="r$i"
218+
# sanitize for filesystem path
219+
safe_ver=$(printf "%s" "$disp_ver" | tr -d '\r' | sed 's/[^A-Za-z0-9._-]/_/g')
220+
221+
workdir="$TMPDIR/review/$pkg/$safe_ver"
222+
mkdir -p "$workdir"
223+
224+
zipfile="$workdir/pkg.zip"
225+
echo " Downloading release $disp_ver: $url" >&2
226+
if ! download_zip "$url" "$zipfile"; then
227+
echo " ! Download failed for $pkg@$disp_ver" >&2
228+
failures=$((failures+1))
229+
continue
230+
fi
231+
232+
echo " Unpacking…" >&2
233+
# Prefer unzip; fallback to Python zipfile
234+
if command -v unzip >/dev/null 2>&1; then
235+
if ! unzip -q -o "$zipfile" -d "$workdir"; then
236+
echo " ! Unzip failed for $pkg@$disp_ver" >&2
237+
failures=$((failures+1))
238+
continue
239+
fi
240+
else
241+
python3 - "$zipfile" "$workdir" <<'PY'
242+
import sys, zipfile, os
243+
zf = zipfile.ZipFile(sys.argv[1])
244+
zf.extractall(sys.argv[2])
245+
PY
246+
if [[ $? -ne 0 ]]; then
247+
echo " ! Unzip failed for $pkg@$disp_ver (python)" >&2
248+
failures=$((failures+1))
249+
continue
250+
fi
251+
fi
252+
253+
# Determine the top-level extracted directory
254+
topdir=$(find "$workdir" -mindepth 1 -maxdepth 1 -type d | head -n1)
255+
if [[ -z "$topdir" ]]; then
256+
echo " ! Could not locate extracted folder for $pkg@$disp_ver" >&2
257+
failures=$((failures+1))
258+
continue
259+
fi
260+
261+
echo " Reviewing with st_package_reviewer: $topdir" >&2
262+
if ! uv run st_package_reviewer "$topdir"; then
263+
echo " ! Review failed for $pkg@$disp_ver" >&2
264+
failures=$((failures+1))
265+
continue
266+
fi
267+
done
268+
done
269+
270+
if [[ $failures -gt 0 ]]; then
271+
echo "Completed crawling with $failures failure(s)." >&2
272+
exit 1
273+
else
274+
echo "Completed crawling successfully." >&2
275+
exit 0
276+
fi
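
`action.sh` also relies on `gh_action/parse_workspace.py` (part of this commit, not rendered above) to turn the crawl's workspace JSON into one tab-separated `<url>\t<version>` record per release. A minimal sketch under that contract; the workspace layout (a per-package `releases` list with `url` and `version` keys) is an assumption:

```python
#!/usr/bin/env python3
"""Hypothetical sketch of parse_workspace.py; not the committed implementation."""
import json
import sys

wsfile, pkg = sys.argv[1], sys.argv[2]
with open(wsfile) as f:
    ws = json.load(f)

# Assumption: the workspace maps package names to a list of releases,
# each carrying at least a download URL and, optionally, a version.
for release in ws.get("packages", {}).get(pkg, {}).get("releases", []):
    url = release.get("url", "")
    version = release.get("version", "")
    if url:
        # action.sh splits each record on the first tab; an empty version
        # falls back to a synthetic "rN" label there.
        print(f"{url}\t{version}")
```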

gh_action/action.yml

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@

name: Package Reviewer PR action
description: Review a repository or channel PR
inputs:
  pr:
    description: Pull Request URL (e.g. https://github.com/wbond/package_control_channel/pull/123)
    required: true
  file:
    description: Path to the channel JSON within the repo
    default: repository.json
    required: false
  thecrawl:
    description: "Optional path to a local thecrawl repo, or a URL to clone (supports @ref to pin)"
    default: https://github.com/packagecontrol/thecrawl
    required: false
runs:
  using: composite
  steps:
    - name: Ensure uv is available
      uses: astral-sh/setup-uv@v3

    - name: Run Package Reviewer
      shell: bash
      run: |
        set -euo pipefail
        "${{ github.action_path }}/action.sh" --pr "${{ inputs.pr }}" --file "${{ inputs.file }}" --thecrawl "${{ inputs.thecrawl }}"
