292 changes: 292 additions & 0 deletions .github/models/pr-desc/build_context.js
@@ -0,0 +1,292 @@
// .github/models/pr-desc/build_context.js
// This script assembles the context that is fed to the PR summarization model.
// Every section below adds a different type of hint (diff, commit messages,
// module docs, etc.) so the model can understand which parts of the repo the
// PR is touching.
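// Usage (as implied by the flag defaults below):
//   node .github/models/pr-desc/build_context.js --base origin/main --head HEAD
// The assembled context is printed to stdout as a single JSON object.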
const { execSync } = require('child_process');
const { readFileSync, readdirSync, existsSync } = require('fs');
const { join } = require('path');

function arg(name, def) {
const i = process.argv.indexOf(name);
return (i > -1 && process.argv[i + 1]) ? process.argv[i + 1] : def;
}
const base = arg('--base', 'origin/main');
const head = arg('--head', 'HEAD');

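// Run a git/shell command, degrading to an empty string on failure
// (e.g. when a ref is missing in a shallow checkout).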
function sh(cmd) {
try { return execSync(cmd, { encoding: 'utf8' }).trim(); }
catch { return ''; }
}
function clip(s, max) {
if (!s) return '';
return s.length <= max ? s : s.slice(0, max) + `\n\n[truncated ${s.length - max} chars]`;
}

// ---------- 0) Rule Loading ----------
const rulesPath = '.github/models/pr-desc/rules.json';
let rules = { modules: [], fallbackModule: 'Misc' };
if (existsSync(rulesPath)) {
try { rules = JSON.parse(readFileSync(rulesPath, 'utf8')); }
catch { /* ignore parse error; keep defaults */ }
}
const compiledPatterns = rules.modules.map(m => ({
name: m.name,
weight: Number(m.weight || 1),
regs: (m.patterns || []).map(p => new RegExp(p))
}));
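// Illustrative rules.json shape (hypothetical names/patterns, not the real file):
// {
//   "fallbackModule": "Misc",
//   "modules": [
//     { "name": "compiler", "weight": 2, "patterns": ["^nntrainer/compiler/"] },
//     { "name": "tensor",   "weight": 2, "patterns": ["^nntrainer/tensor/"] }
//   ]
// }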

function classifyModule(filepath) {
for (const m of compiledPatterns) {
if (m.regs.some(r => r.test(filepath))) return { module: m.name, weight: m.weight };
}
return { module: rules.fallbackModule || 'Misc', weight: 1 };
}

// Utility helpers for normalising and scoring keywords that describe
// "interesting" parts of the change. These are later used to selectively load
// module documentation.
function normalizeToken(token) {
return token ? token.toLowerCase().replace(/[^a-z0-9]/g, '') : '';
}
const interestTokens = new Map();
function bumpToken(token, weight = 1) {
const norm = normalizeToken(token);
if (!norm) return;
interestTokens.set(norm, (interestTokens.get(norm) || 0) + weight);
}
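// e.g. bumpToken('cpu_backend', 3) accumulates weight under the normalized
// key 'cpubackend', which later matches the cpu_backend.md doc slug exactly.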

// ---------- 1) Overview / Modules Source Documents ----------
const ctxRoot = '.github/models/pr-desc/context';
let overview = '';
const overviewPath = join(ctxRoot, 'overview.md');
if (existsSync(overviewPath)) {
overview = readFileSync(overviewPath, 'utf8');
}
overview = clip(overview, 8000);

// ---------- 2) Git diff/numstat ----------
const nameStatusRaw = sh(`git diff --name-status -M -C ${base}...${head}`);
const statRaw = sh(`git diff --stat ${base}...${head}`);
const numstatRaw = sh(`git diff --numstat -M -C ${base}...${head}`);
const commitSubjects = sh(`git log --pretty=%s ${base}..${head}`).split('\n').filter(Boolean);
const commitBodiesRaw = sh(`git log --pretty=%B ${base}..${head}`);
const commitBodies = commitBodiesRaw.split('\n\n').map(s => s.trim()).filter(Boolean).slice(0, 10).map(s => clip(s, 800));
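// NB: splitting %B output on blank lines yields paragraphs rather than whole
// commit messages; the first 10 paragraphs are kept as clipped samples.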

// name-status parsing: each tab-separated line is "STATUS\tpath", or
// "STATUS\told\tnew" for renames/copies (e.g. R100, C75).
const changedFiles = [];
if (nameStatusRaw) {
for (const line of nameStatusRaw.split('\n')) {
if (!line.trim()) continue;
const parts = line.split('\t');
const status = parts[0]; // e.g., 'M', 'A', 'D', 'R100'
const from = parts[1];
const to = parts[2] || parts[1];
changedFiles.push({ status, from, to, path: to });
// Directory / filename tokens tell us which doc hints are relevant.
const segments = to.split('/').slice(0, 3); // favour the upper path for module hints
segments.forEach((seg, idx) => bumpToken(seg, Math.max(1, 3 - idx)));
const baseName = to.split('/').pop();
if (baseName) {
const [stem] = baseName.split('.');
bumpToken(stem, 1);
}
}
}
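// Example: a rename line "R100\tsrc/old_math.c\tsrc/fast_math.c" (hypothetical)
// parses to { status: 'R100', from: 'src/old_math.c', to: 'src/fast_math.c' },
// with interest tokens bumped from the new path's segments and filename stem.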

// numstat parsing: "added\tremoved\tpath"; with -M/-C, renamed files appear
// in combined "dir/{old => new}" (or "old => new") notation, which we
// resolve to the new path so churn can be joined with name-status entries.
const churnMap = new Map(); // key: new path, value: { added, removed }
function resolveNumstatPath(raw) {
if (!raw.includes(' => ')) return raw;
const braced = raw.match(/^(.*)\{(.*) => (.*)\}(.*)$/);
if (braced) return (braced[1] + braced[3] + braced[4]).replace(/\/\//g, '/');
const segs = raw.split(' => ');
return segs[segs.length - 1];
}
if (numstatRaw) {
for (const line of numstatRaw.split('\n')) {
if (!line.trim()) continue;
const parts = line.split('\t');
if (parts.length < 3) continue;
const added = parts[0] === '-' ? 0 : parseInt(parts[0], 10) || 0; // '-' marks binary files
const removed = parts[1] === '-' ? 0 : parseInt(parts[1], 10) || 0;
const path = resolveNumstatPath(parts.slice(2).join('\t'));
churnMap.set(path, { added, removed });
}
}

// ---------- 2b) Relevance-driven module docs ----------
// changedFiles (built above) drives which module docs are loaded below.
const modulesDir = join(ctxRoot, 'modules');
let modulesDoc = '';
if (existsSync(modulesDir)) {
// Step 1: module classifier votes. We bump interest tokens with the module
// names so docs such as compiler.md or layers.md get a high score if a file
// belonging to that module changed.
for (const f of changedFiles) {
const cls = classifyModule(f.path);
if (cls && cls.module) {
bumpToken(cls.module, 4 * (cls.weight || 1));
}
}

// Step 2: commit messages occasionally mention the component (“dataset”,
// “optimizers”). We extract keywords to guide doc selection when diff paths
// alone are not conclusive.
for (const subj of commitSubjects) {
const words = subj.toLowerCase().match(/[a-z0-9]{4,}/g) || [];
for (const w of words) bumpToken(w, 0.5);
}

const allMd = readdirSync(modulesDir).filter(f => f.endsWith('.md'));
const docScores = allMd.map(file => {
const slug = normalizeToken(file.replace(/\.md$/, ''));
let score = 0;
if (interestTokens.has(slug)) score += interestTokens.get(slug) * 2;
for (const [token, val] of interestTokens.entries()) {
if (token && slug.includes(token) && token !== slug) score += val;
}
return { file, score };
}).sort((a, b) => b.score - a.score);

const pickedDocs = docScores.filter(d => d.score > 0).map(d => d.file).slice(0, 8);
const fallbackDocs = pickedDocs.length ? pickedDocs : docScores.slice(0, 3).map(d => d.file);

for (const f of fallbackDocs) {
const body = readFileSync(join(modulesDir, f), 'utf8');
modulesDoc += `\n\n## ${f}\n` + clip(body, 6000);
}
}
modulesDoc = clip(modulesDoc, 24000);

function statusWeight(status) {
// Rxxx, Cxxx, etc. are considered renames/copies
if (status.startsWith('R') || status.startsWith('C')) return 2.0;
if (status === 'D') return 2.5;
if (status === 'A') return 1.5;
// Default M
return 1.0;
}

// ---------- 3) Module Grouping + Impact Calculation ----------
const modulesAgg = new Map(); // name -> { files, score, baseWeight, count, adds, dels, hasRename, hasDelete }
const unmatched = [];
for (const f of changedFiles) {
const cls = classifyModule(f.path);
if (!cls || !cls.module) { unmatched.push(f.path); continue; }
const key = cls.module;
if (!modulesAgg.has(key)) {
modulesAgg.set(key, {
files: [],
score: 0,
baseWeight: cls.weight,
count: 0,
adds: 0,
dels: 0,
hasRename: false,
hasDelete: false
});
}
const agg = modulesAgg.get(key);
agg.files.push({ path: f.path, status: f.status });
agg.count += 1;

const churn = churnMap.get(f.path) || { added: 0, removed: 0 };
agg.adds += churn.added;
agg.dels += churn.removed;
if (f.status.startsWith('R') || f.status.startsWith('C')) agg.hasRename = true;
if (f.status === 'D') agg.hasDelete = true;

// score = (module weight) * (status weight) * (size weight)
const sizeFactor = Math.log10(1 + churn.added + churn.removed + 1); // roughly 0–4 for typical diffs
agg.score += cls.weight * statusWeight(f.status) * (1 + sizeFactor);
}
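// Worked example: a modified ('M') file in a weight-2 module with +120/-30
// lines gives sizeFactor = log10(1 + 150 + 1) ≈ 2.18, contributing
// 2 * 1.0 * (1 + 2.18) ≈ 6.4 to the module score.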

// Final impact level determination by module
function levelFromScore(s) {
if (s >= 30) return 'High';
if (s >= 12) return 'Medium';
return 'Low';
}
function bumpForSignals(agg) {
let bonus = 0;
if (agg.hasDelete) bonus += 2.0;
if (agg.hasRename) bonus += 1.0;
if (agg.count >= 10) bonus += 1.5;
const churn = agg.adds + agg.dels;
if (churn >= 500) bonus += 2.0;
else if (churn >= 200) bonus += 1.0;
return bonus;
}
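// Example: a module with deletions, 10+ files, and 500+ churned lines earns
// a +5.5 bonus (2.0 + 1.5 + 2.0) on top of its base score before the
// High (>=30) / Medium (>=12) cut.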
const moduleImpact = {};
for (const [name, agg] of modulesAgg.entries()) {
const score = agg.score + bumpForSignals(agg);
moduleImpact[name] = {
impact: levelFromScore(score),
score: Math.round(score * 10) / 10,
files: agg.files,
stats: { files: agg.count, added: agg.adds, removed: agg.dels,
rename: agg.hasRename, delete: agg.hasDelete }
};
}
// ---------- Extra reviewer signals (derived from changedFiles) ----------
function headerOrConfig(p) {
return /\.(h|hpp|hh|hxx|inc)$/.test(p) ||
/(^|\/)(CMakeLists\.txt|configure|.*\.cmake|.*\.bazel|build\.gradle|settings\.gradle|package\.json)$/.test(p);
}
const apiSurfaceChanges = changedFiles.filter(f => headerOrConfig(f.path)).map(f => f.path);
const testFiles = changedFiles.filter(f => /(^|\/)(test|tests|testing|spec)\b|_test\.(cc|cpp|c|py|js|ts)$/.test(f.path)).map(f => f.path);
const concurrencySensitive = changedFiles.filter(f => /(thread|mutex|atomic|lock|concurrent|parallel)/i.test(f.path)).map(f => f.path);
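// Example: a change to "nntrainer/tensor/lock_free_queue.h" (hypothetical
// path) would appear in both apiSurfaceChanges and concurrencySensitive.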

// ---------- 4) Diff/Commits Text ----------
const diff = clip(`### name-status\n${nameStatusRaw}\n\n### stat\n${statRaw}`, 8000);

const buckets = { feat:0, fix:0, refactor:0, test:0, docs:0, chore:0, other:0 };
for (const s of commitSubjects) {
const t = s.toLowerCase();
if (t.startsWith('feat')) buckets.feat++;
else if (t.startsWith('fix')) buckets.fix++;
else if (t.startsWith('refactor')) buckets.refactor++;
else if (t.startsWith('test')) buckets.test++;
else if (t.startsWith('docs')) buckets.docs++;
else if (t.startsWith('chore')) buckets.chore++;
else buckets.other++;
}
const commits =
`Total commits: ${commitSubjects.length}\n` +
Object.entries(buckets).map(([k,v]) => `- ${k}: ${v}`).join('\n') +
(commitSubjects.length ? `\n\nSamples:\n- ${commitSubjects.slice(0,5).join('\n- ')}` : '') +
(commitBodies.length ? `\n\nCommit bodies (top, clipped):\n- ${commitBodies.join('\n- ')}` : '');

// ---------- 5) Module Impact Summary Text (Model Hint) ----------
let moduleImpactSummary = '';
if (Object.keys(moduleImpact).length) {
// top 5 modules, highest impact score first
const top = Object.entries(moduleImpact)
.sort((a,b) => b[1].score - a[1].score)
.slice(0,5);
moduleImpactSummary = top.map(([name, m]) =>
`- ${name}: impact=${m.impact} (score=${m.score}, files=${m.stats.files}, +${m.stats.added}/-${m.stats.removed}${m.stats.rename?', rename':''}${m.stats.delete?', delete':''})`
).join('\n');
}

// ---------- 6) Output ----------
const out = {
overview, // already clipped to 8000 chars above
modules: modulesDoc,
diff,
commits,
// new fields
moduleImpact, // per-module detail (machine-readable)
moduleImpactSummary: moduleImpactSummary || '(no module impact detected)',
reviewerSignals: {
apiSurfaceChanges,
testFiles,
concurrencySensitive
}
};

process.stdout.write(JSON.stringify(out, null, 2));
54 changes: 54 additions & 0 deletions .github/models/pr-desc/context/modules/compiler.md
@@ -0,0 +1,54 @@
# Compiler Module (`nntrainer/compiler`)

## Responsibility

Transforms high-level model descriptions (INI, ONNX, TFLite) into an executable computation graph. Handles:

- Parsing different configuration formats.
- Realization/rewriting passes that insert or rewrite layers and operations.
- Export of compiled graphs back to formats (e.g. TFLite).

## Key components

Typical files:

- `compiler.h`, `compiler_fwd.h` — main compiler interfaces and entry points.
- `interpreter.h` and concrete interpreters:
- `ini_interpreter.*`
- `onnx_interpreter.*`
- Realizer components:
- `*_realizer.*` (e.g. `activation_realizer`, `bn_realizer`, `flatten_realizer`, `input_realizer`).
- `flatbuffer_opnode.*` — representation of operations for flatbuffer export.

## Dependencies and interactions

- Consumes model definitions from `models/` and configuration files.
- Produces graph structures that are consumed by `graph/` and `layers/`.
- Uses `schema/` for serialization formats and `utils/` for property handling/logging.
- Strongly tied to the semantics of `layers/` and `tensor/` (shapes, data formats).

## Typical changes

- Adding support for a new layer or operation to interpreters and realizers.
- Extending INI/ONNX parsing to support new attributes.
- Adjusting passes that insert auxiliary layers (e.g. activation, batch norm).

## Review focus

When files under `nntrainer/compiler/` change, focus on:

- **Semantic correctness**:
- Are the inferred tensor shapes and data formats consistent with `layers/` and `tensor/`?
- Are default properties and initializers aligned with existing behaviour?
- **Backward compatibility**:
- INI or ONNX field changes should not silently break existing configs.
- Versioned schema changes should be handled explicitly.
- **Graph transformations**:
- Realizer passes must preserve numerical semantics and training behaviour.
- Check that helper layers are inserted in the correct order and with correct connections.

## Common pitfalls

- Incomplete handling of corner cases (e.g. scalar/broadcast shapes, dynamic dimensions).
- Forgetting to register new operations in all relevant interpreters and tests.
- Introducing subtle changes in execution order that affect model numerics.
49 changes: 49 additions & 0 deletions .github/models/pr-desc/context/modules/cpu_backend.md
@@ -0,0 +1,49 @@
# Tensor CPU Backend (`nntrainer/tensor/cpu_backend`)

## Responsibility

Implements CPU-optimized math backends and kernels used by the tensor module:

- GEMM and other BLAS-like kernels for float/half/int quantized types.
- Architecture-specific optimizations (ARM NEON, x86, etc.).
- Abstraction of backend selection and dispatch.

## Key components

Representative files:

- `cpu_backend/arm/*` — ARM-specific backends:
- `arm_compute_backend.*` — integration with ARM Compute Library where applicable.
- `hgemm/*` — half-precision GEMM kernels and padding/packing utilities.
- `kai/*` — quantized matmul implementations and related utilities.
- Backend interfaces and utilities described in `cpu_backend/README.md`.

## Dependencies and interactions

- Called by higher-level tensor operations in `nntrainer/tensor/`.
- Used heavily by `layers/` for fully connected, convolution, and other compute-heavy ops.
- Must remain compatible with tensor layouts and quantization schemes.

## Typical changes

- Adding new microkernels for specific shapes or data types.
- Tuning existing kernels for performance on new CPU targets.
- Refactoring backend interfaces for clarity or new features (e.g., quantization support).

## Review focus

For changes in CPU backend:

- **Numerical correctness**:
- Compare against reference implementations for a wide range of sizes and values.
- **Performance**:
- Kernels are hot paths; watch for unnecessary branches, loads, and stores.
- **Portability**:
- Guard architecture-specific code with appropriate feature checks (see the sketch after this list).
- Confirm the build still works on non-target architectures.
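
A minimal sketch of such a guard, assuming a generic NEON-vs-portable split (names are illustrative, not nntrainer's actual kernels):

```cpp
#include <stddef.h>

// Hypothetical elementwise scale kernel behind a compile-time feature guard.
#if defined(__ARM_NEON)
#include <arm_neon.h>
static void scale_f32(float *dst, const float *src, float s, size_t n) {
  size_t i = 0;
  const float32x4_t vs = vdupq_n_f32(s);
  for (; i + 4 <= n; i += 4) // vectorized main loop, 4 floats per iteration
    vst1q_f32(dst + i, vmulq_f32(vld1q_f32(src + i), vs));
  for (; i < n; ++i) // scalar remainder loop -- a classic source of edge bugs
    dst[i] = src[i] * s;
}
#else
// Portable fallback keeps non-ARM builds working.
static void scale_f32(float *dst, const float *src, float s, size_t n) {
  for (size_t i = 0; i < n; ++i)
    dst[i] = src[i] * s;
}
#endif
```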

## Common pitfalls

- Alignment assumptions mismatched with actual allocation.
- Incorrect handling of edge tiles or remainder loops.
- Divergence between reference tensor semantics and backend implementation (e.g., layout or zero-padding assumptions).