feat: brain-engine + brain-ui + docs — template full stack standalone

- brain-engine: server, embed, search, RAG, MCP, start.sh (standalone)
- brain-ui: source React complète, build.sh, DocsView avec tier colors
- docs: 14 pages guides humains (getting-started, architecture, sessions, workflows, agents, vues tier)
- brain-compose.yml v0.9.0: tier featured ajouté, sessions/agents par tier, coach_level, API key schema
- DISTRIBUTION_CHECKLIST v1.2: brain-engine + brain-ui + docs dans la checklist
2026-03-20 20:25:40 +01:00
parent c249d417f5
commit 8244a07881
93 changed files with 12088 additions and 34 deletions
--- a/brain-engine/embed.py
+++ b/brain-engine/embed.py
@@ -0,0 +1,524 @@
+#!/usr/bin/env python3
+"""
+brain-engine/embed.py — Pipeline d'embedding BE-2c
+Indexe le corpus brain via Ollama nomic-embed-text → table embeddings dans brain.db
+
+Usage :
+  python3 brain-engine/embed.py                  → index tout le corpus
+  python3 brain-engine/embed.py --dry-run        → liste les chunks sans embed
+  python3 brain-engine/embed.py --file agents/helloWorld.md  → réindexer un fichier
+  python3 brain-engine/embed.py --stats          → stats de l'index actuel
+
+Headless : zéro dépendance Wayland/display.
+OLLAMA_URL : variable d'env (défaut localhost:11434) — supporte réseau local.
+
+Zone filter — ADR-033a (2026-03-18) :
+  kernel  (agents/, wiki/, toolkit/, contexts/, KERNEL.md) → toujours indexé
+  project (projets/, handoffs/, workspace/)                → TTL 60 jours git-based
+  session (claims/)                                        → JAMAIS indexé
+  personal (profil/bact/, profil/collaboration.md)         → JAMAIS indexé
+  profil/decisions/                                        → scope frontmatter (kernel | project)
+
+Stratégie chunking par type :
+  agents/*.md, projets/*.md, wiki/**/*.md  → chunk par section H2
+  workspace/**/*.md, profil/decisions/*.md → H2 ou fichier entier si < 512 tokens
+  KERNEL.md, focus.md, contexts/           → fichier entier (documents courts)
+"""
+
+import os
+import re
+import sys
+import json
+import struct
+import hashlib
+import argparse
+import sqlite3
+import subprocess
+import time
+import urllib.request
+import urllib.error
+from datetime import datetime
+from pathlib import Path
+
+# Repository root: this file sits in <root>/brain-engine/, hence two levels up.
+BRAIN_ROOT   = Path(__file__).parent.parent
+# SQLite database file that holds the embeddings table.
+DB_PATH      = BRAIN_ROOT / 'brain.db'
+# Ollama endpoint and embedding model — both overridable through the environment.
+OLLAMA_URL   = os.getenv('OLLAMA_URL', 'http://localhost:11434')
+EMBED_MODEL  = os.getenv('EMBED_MODEL', 'nomic-embed-text')
+
+# Guardrail — generic LLMs are forbidden: guaranteed machine freeze on the full corpus
+# (validated empirically: mistral:7b + qwen3:8b → total freeze ~20 min, 2026-03-16).
+# The check is a case-insensitive substring match on EMBED_MODEL and runs at
+# import time: a blocked model terminates the process via sys.exit().
+_BLOCKED_MODELS = ['mistral', 'qwen', 'llama', 'gemma', 'phi', 'deepseek']
+if any(b in EMBED_MODEL.lower() for b in _BLOCKED_MODELS):
+    sys.exit(f"❌ EMBED_MODEL='{EMBED_MODEL}' interdit — LLM générique → freeze machine sur corpus entier.\n"
+             f"   Utiliser un modèle dédié embedding : nomic-embed-text, mxbai-embed-large, all-minilm")
+
+CHUNK_TOKENS = 512   # max tokens per chunk (approximated: 1 token ≈ 4 chars)
+CHUNK_OVERLAP = 64   # overlap between consecutive chunks
+
+# ── Access zones ──────────────────────────────────────────────────────────────
+
+# Zone 0 — never indexed (absolute privacy) — ADR-033a
+# Entries are matched as path prefixes by is_private(); a trailing slash
+# designates a whole directory.
+PRIVATE_PATHS = [
+    'profil/capital.md',
+    'profil/objectifs.md',
+    'profil/bact/',           # personal — never
+    'profil/collaboration.md',# personal — never
+    'progression/',           # personal — journal + the whole directory
+    'MYSECRETS',
+]
+
+# Zone by prefix — first match wins — ADR-033a + KERNEL.md zones
+# Zones: kernel | instance | satellite | public  (private = total exclusion, see above)
+PATH_SCOPES = [
+    # KERNEL — maximum protection
+    ('contexts/',             'kernel'),
+    ('profil/decisions/',     'kernel'),
+    ('profil/',               'kernel'),
+    ('KERNEL.md',             'kernel'),
+    ('brain-constitution.md', 'kernel'),
+    ('scripts/',              'kernel'),
+    # INSTANCE — machine configuration + active projects
+    ('focus.md',              'instance'),
+    ('projets/',              'instance'),
+    ('PATHS.md',              'instance'),
+    ('now.md',                'instance'),
+    # SATELLITE — free-living content, may be promoted
+    ('toolkit/',              'satellite'),
+    ('todo/',                 'satellite'),
+    ('workspace/',            'satellite'),
+    ('handoffs/',             'satellite'),
+    ('intentions/',           'satellite'),
+    # PUBLIC — visible, distributed
+    ('wiki/',                 'public'),
+    ('agents/',               'public'),
+    ('infrastructure/',       'public'),
+    ('BRAIN-INDEX.md',        'public'),
+]
+# Scope returned by resolve_scope() when no prefix in PATH_SCOPES matches.
+DEFAULT_SCOPE = 'public'
+
+
+TTL_PROJECT_DAYS = 60  # ADR-033a — project TTL, git-based
+
+
+def is_private(filepath: str) -> bool:
+    """Tell whether *filepath* falls in Zone 0 (never indexed, never accessible).
+
+    A path is private when it equals, or starts with, one of the
+    PRIVATE_PATHS entries.
+    """
+    # str.startswith accepts a tuple of prefixes; equality is a special case
+    # of a prefix match, so a single call covers both original tests.
+    return filepath.startswith(tuple(PRIVATE_PATHS))
+
+
+def resolve_scope(filepath: str) -> str:
+    """Return the access zone of *filepath* (kernel | instance | satellite | public).
+
+    PATH_SCOPES is scanned in order and the first matching prefix wins;
+    DEFAULT_SCOPE is returned when nothing matches.
+    """
+    matching_zones = (
+        zone for prefix, zone in PATH_SCOPES if filepath.startswith(prefix)
+    )
+    return next(matching_zones, DEFAULT_SCOPE)
+
+
+def get_frontmatter_scope(filepath: Path) -> str | None:
+    """
+    Read the ``scope:`` field from the YAML frontmatter of a .md file.
+
+    ADR-033a Rule 2 — the frontmatter overrides the directory rule.
+
+    Args:
+        filepath: file to inspect.
+
+    Returns:
+        The scope value (expected: 'kernel' | 'project' | 'personal') with
+        inline comments and surrounding quotes removed, or None when the file
+        is unreadable, has no frontmatter, or declares no (or an empty) scope.
+    """
+    try:
+        # utf-8-sig: decode as UTF-8 whatever the locale, and drop a leading
+        # BOM that would otherwise hide the opening '---'.
+        text = filepath.read_text(encoding='utf-8-sig', errors='replace')
+    except OSError:
+        return None
+
+    if not text.startswith('---'):
+        return None
+    end = text.find('\n---', 3)
+    if end == -1:
+        return None
+
+    for line in text[3:end].splitlines():
+        line = line.strip()
+        if not line.startswith('scope:'):
+            continue
+        val = line[len('scope:'):].split('#')[0].strip()  # drop inline comments
+        # YAML allows quoted scalars: `scope: "personal"` must read as personal,
+        # otherwise a personal file would silently be treated as indexable.
+        val = val.strip('"\'').strip()
+        return val or None
+    return None
+
+
+def get_git_age_days(filepath: Path) -> int | None:
+    """
+    Return the number of days elapsed since the last git commit touching this file.
+
+    None when git prints no commit timestamp for the path or when anything
+    fails (git missing, timeout, unparsable output).
+    ADR-033a — git-based TTL, no BSI coupling.
+    """
+    try:
+        proc = subprocess.run(
+            ['git', 'log', '-1', '--format=%ct', '--', str(filepath)],
+            capture_output=True, text=True, cwd=str(BRAIN_ROOT), timeout=5
+        )
+        stamp = proc.stdout.strip()
+        if stamp:
+            # %ct is the committer date as a Unix timestamp.
+            return int((time.time() - int(stamp)) / 86400)
+    except Exception:
+        return None
+    return None
+
+
+def should_skip_by_zone(filepath: Path) -> bool:
+    """
+    Apply the ADR-033a rules — return True when the file must be left out.
+
+    Rule 1 — directory (default)
+    Rule 2 — frontmatter ``scope:`` (overrides Rule 1, for profil/decisions/)
+
+    Zones:
+      kernel                  → False (always indexed)
+      project + age > 60 days → True  (expired)
+      personal                → True  (never)
+
+    *filepath* must be located under BRAIN_ROOT.
+    """
+    rel = str(filepath.relative_to(BRAIN_ROOT))
+
+    def ttl_expired() -> bool:
+        # An unknown age (untracked file, git failure) never expires a file.
+        age = get_git_age_days(filepath)
+        return age is not None and age > TTL_PROJECT_DAYS
+
+    # profil/decisions/ — Rule 2: the scope comes from the frontmatter
+    if rel.startswith('profil/decisions/'):
+        declared = get_frontmatter_scope(filepath)
+        if declared == 'personal':
+            return True
+        # scope: kernel or absent → always indexed
+        return declared == 'project' and ttl_expired()
+
+    # Project zone — git-based TTL
+    if rel.startswith(('projets/', 'handoffs/', 'workspace/')):
+        return ttl_expired()
+
+    return False
+
+
+# Corpus to index — paths relative to BRAIN_ROOT — ADR-033a
+# kernel → always  |  project → 60-day git TTL  |  omitted → NEVER
+# Each entry is (base directory, glob pattern, chunking strategy); the strategy
+# is 'h2' (split on H2 sections) or 'file' (whole file).
+CORPUS_PATHS = [
+    # ── kernel — always indexed ───────────────────────────────────────────────
+    ('agents',           '*.md',    'h2'),    # brain agents
+    ('wiki',             '**/*.md', 'h2'),    # documentation (submodule)
+    ('toolkit',          '**/*.md', 'h2'),    # reusable patterns
+    ('contexts',         '*.yml',   'file'),  # session contexts
+    # ── project — 60-day git-based TTL ────────────────────────────────────────
+    ('projets',          '*.md',    'h2'),
+    ('handoffs',         '*.md',    'file'),
+    ('workspace',        '**/*.md', 'h2'),
+    # ── profil/decisions — scope from frontmatter (kernel | project) ─────────
+    ('profil/decisions', '*.md',    'file'),
+    # ── kernel root files ─────────────────────────────────────────────────────
+    ('.',                'KERNEL.md',      'file'),
+    ('.',                'focus.md',       'file'),
+    ('.',                'BRAIN-INDEX.md', 'file'),
+    # REMOVED: ('ADR', ...) — obsolete path (ADRs live in profil/decisions/)
+    # REMOVED: ('profil', ...) — too broad, includes bact/ — handled by scope
+    # REMOVED: ('claims', ...) — NEVER indexed per ADR-033a (structured session)
+]
+
+# Files to exclude — substrings looked up in the file path by should_exclude()
+EXCLUDE_PATTERNS = [
+    'brain-template/',
+    'brain-engine/',
+    '.git/',
+    'node_modules/',
+]
+
+
+# ── Helpers ───────────────────────────────────────────────────────────────────
+
+def should_exclude(filepath: Path) -> bool:
+    """Return True when *filepath* must never enter the index.
+
+    A file is excluded when one of its directories matches EXCLUDE_PATTERNS
+    or when it is private (Zone 0, see is_private).
+
+    Args:
+        filepath: absolute path (normally under BRAIN_ROOT) or a path already
+            relative to BRAIN_ROOT.
+    """
+    # Work on the BRAIN_ROOT-relative path with forward slashes. Matching on
+    # the absolute path would exclude the whole corpus as soon as the checkout
+    # itself lives under a directory such as ".../brain-template/".
+    if filepath.is_absolute():
+        try:
+            rel = filepath.relative_to(BRAIN_ROOT).as_posix()
+        except ValueError:
+            rel = filepath.as_posix()  # outside BRAIN_ROOT — fall back to the full path
+    else:
+        rel = filepath.as_posix()
+
+    # Anchor each pattern on a path-component boundary so that "node_modules/"
+    # matches "a/node_modules/x" but not "a/my-node_modules/x".
+    anchored = '/' + rel.lstrip('/')
+    if any('/' + pattern in anchored for pattern in EXCLUDE_PATTERNS):
+        return True
+
+    # Zone 0 — absolute privacy, never indexed
+    return is_private(rel)
+
+
+def chunk_by_h2(text: str, filepath: str) -> list[dict]:
+    """Split a markdown document into one chunk per H2 section.
+
+    Sections longer than CHUNK_TOKENS * 4 characters are re-split by size;
+    their sub-chunks keep the section heading as title so that they remain
+    identifiable.
+
+    Args:
+        text: full markdown content.
+        filepath: path stored on every chunk.
+
+    Returns:
+        A list of ``{'text', 'title', 'filepath'}`` dicts. When no non-empty
+        section is found, a single chunk holding *text* is returned.
+    """
+    chunks = []
+    # The lookahead keeps the "## " marker at the start of each section.
+    for sec in re.split(r'\n(?=## )', text):
+        sec = sec.strip()
+        if not sec:
+            continue
+        first_line = sec.split('\n')[0]
+        if len(sec) > CHUNK_TOKENS * 4:
+            # Section too long → re-split by size. Only a real heading is
+            # propagated as title; a plain first paragraph line is not.
+            heading = first_line.strip('#').strip() if first_line.startswith('#') else ''
+            for sub in chunk_by_size(sec, filepath):
+                sub['title'] = heading
+                chunks.append(sub)
+        else:
+            title = first_line.strip('#').strip()
+            chunks.append({'text': sec, 'title': title, 'filepath': filepath})
+    return chunks if chunks else [{'text': text, 'title': '', 'filepath': filepath}]
+
+
+def chunk_by_size(text: str, filepath: str) -> list[dict]:
+    """Cut *text* into windows of at most CHUNK_TOKENS * 4 characters.
+
+    Consecutive windows overlap by CHUNK_OVERLAP * 4 characters, and a window
+    is shortened to end on a newline when one is available. Chunks that are
+    empty after stripping are dropped. Every chunk has an empty title.
+    """
+    window = CHUNK_TOKENS * 4
+    backtrack = CHUNK_OVERLAP * 4
+    total = len(text)
+    pieces = []
+    pos = 0
+    while pos < total:
+        stop = min(pos + window, total)
+        if stop < total:
+            # Prefer cutting on the last newline inside the window.
+            newline_at = text.rfind('\n', pos, stop)
+            if newline_at > pos:
+                stop = newline_at
+        body = text[pos:stop].strip()
+        if body:
+            pieces.append({'text': body, 'title': '', 'filepath': filepath})
+        if stop >= total:
+            break
+        # Always move forward: if the overlap would step back to or before
+        # the current position, resume at the cut point instead.
+        candidate = stop - backtrack
+        pos = candidate if candidate > pos else stop
+    return pieces
+
+
+def chunk_file(filepath: Path, strategy: str) -> list[dict]:
+    """Read a file and return its chunks according to *strategy*.
+
+    Args:
+        filepath: file to read; must be located under BRAIN_ROOT.
+        strategy: 'h2' to split on H2 sections; any other value keeps the
+            file whole unless it exceeds CHUNK_TOKENS * 4 characters, in
+            which case it is split by size.
+
+    Returns:
+        A list of ``{'text', 'title', 'filepath'}`` dicts whose filepath is
+        relative to BRAIN_ROOT; empty when the file is unreadable or blank.
+    """
+    try:
+        # Explicit UTF-8: the locale default would garble accented text on
+        # non-UTF-8 systems and make chunk ids differ between machines.
+        text = filepath.read_text(encoding='utf-8', errors='replace').strip()
+    except Exception as e:
+        print(f"  ⚠️  {filepath.name} : erreur lecture — {e}")
+        return []
+
+    if not text:
+        return []
+
+    rel = str(filepath.relative_to(BRAIN_ROOT))
+
+    if strategy == 'h2':
+        return chunk_by_h2(text, rel)
+
+    # Whole file — if too long, split by size
+    if len(text) > CHUNK_TOKENS * 4:
+        return chunk_by_size(text, rel)
+    return [{'text': text, 'title': filepath.stem, 'filepath': rel}]
+
+
+def chunk_id(filepath: str, text: str) -> str:
+    """Build a deterministic id: 'emb-' + first 12 hex digits of sha1(filepath::text[:64]).
+
+    Only the first 64 characters of *text* take part in the hash.
+    """
+    seed = "::".join((filepath, text[:64]))
+    digest = hashlib.sha1(seed.encode()).hexdigest()
+    return "emb-" + digest[:12]
+
+
+# ── Ollama API ────────────────────────────────────────────────────────────────
+
+def get_embedding(text: str) -> list[float] | None:
+    """Call the Ollama embeddings API for *text*.
+
+    Returns:
+        The value of the 'embedding' field of the JSON response, or None when
+        Ollama is unreachable, the transfer fails, or the response is not
+        the expected JSON object.
+    """
+    # Local import: the module does not import http.client at file level.
+    from http.client import HTTPException
+
+    url = f"{OLLAMA_URL}/api/embeddings"
+    payload = json.dumps({"model": EMBED_MODEL, "prompt": text}).encode()
+    req = urllib.request.Request(url, data=payload,
+                                  headers={"Content-Type": "application/json"})
+    try:
+        with urllib.request.urlopen(req, timeout=30) as resp:
+            data = json.loads(resp.read())
+    except (OSError, HTTPException) as e:
+        # OSError covers URLError, TimeoutError and connection resets;
+        # HTTPException covers truncated or malformed HTTP responses.
+        print(f"  ⚠️  Ollama indisponible ({OLLAMA_URL}) : {e}")
+        return None
+    except ValueError as e:
+        # Body is not valid JSON (json.JSONDecodeError is a ValueError).
+        print(f"  ⚠️  Réponse Ollama invalide ({OLLAMA_URL}) : {e}")
+        return None
+
+    if not isinstance(data, dict):
+        return None
+    return data.get('embedding')
+
+
+def vector_to_blob(vec: list[float]) -> bytes:
+    """Pack a vector into a SQLite BLOB of float32 values (4 bytes each).
+
+    Uses struct's native byte order, like blob_to_vector.
+    """
+    layout = '%df' % len(vec)
+    return struct.pack(layout, *vec)
+
+
+def blob_to_vector(blob: bytes) -> list[float]:
+    """Unpack a SQLite BLOB of float32 values back into a list of floats.
+
+    The BLOB length must be a multiple of 4 bytes, otherwise struct.unpack
+    raises struct.error.
+    """
+    count, _ = divmod(len(blob), 4)
+    return [*struct.unpack('%df' % count, blob)]
+
+
+# ── SQLite ────────────────────────────────────────────────────────────────────
+
+def connect() -> sqlite3.Connection:
+    """Open brain.db and make sure the embeddings schema is in place.
+
+    Enables WAL journaling, creates the ``embeddings`` table and its indexes
+    when missing, and migrates databases created before the ``scope`` column
+    existed (adds the column, then backfills it from each row's filepath).
+
+    Returns:
+        An open connection whose rows are sqlite3.Row objects.
+
+    Raises:
+        sqlite3.Error: when the schema cannot be created or migrated; the
+            connection is closed before the error propagates.
+    """
+    conn = sqlite3.connect(DB_PATH)
+    try:
+        conn.row_factory = sqlite3.Row
+        conn.execute("PRAGMA journal_mode=WAL")
+        # Create the embeddings table when missing (extends the schema)
+        conn.execute("""
+        CREATE TABLE IF NOT EXISTS embeddings (
+            chunk_id    TEXT PRIMARY KEY,
+            filepath    TEXT NOT NULL,
+            title       TEXT,
+            chunk_text  TEXT NOT NULL,
+            vector      BLOB,               -- NULL si Ollama indisponible au moment du chunk
+            model       TEXT,
+            indexed     INTEGER DEFAULT 0,  -- 1 = vecteur présent
+            scope       TEXT NOT NULL DEFAULT 'work',  -- kernel | instance | satellite | public
+            created_at  TEXT NOT NULL DEFAULT (datetime('now')),
+            updated_at  TEXT NOT NULL DEFAULT (datetime('now'))
+        )
+    """)
+        # Migration — add scope when missing (database created before BE-4).
+        # The column is looked up explicitly instead of attempting the ALTER
+        # and swallowing every exception, so real failures are not hidden.
+        columns = {row['name'] for row in conn.execute("PRAGMA table_info(embeddings)")}
+        if 'scope' not in columns:
+            conn.execute("ALTER TABLE embeddings ADD COLUMN scope TEXT NOT NULL DEFAULT 'work'")
+            # Backfill — resolve the scope of every existing chunk from its filepath
+            filepaths = [row['filepath'] for row in
+                         conn.execute("SELECT DISTINCT filepath FROM embeddings")]
+            conn.executemany("UPDATE embeddings SET scope = ? WHERE filepath = ?",
+                             [(resolve_scope(fp), fp) for fp in filepaths])
+            conn.commit()
+        conn.execute("CREATE INDEX IF NOT EXISTS idx_emb_filepath ON embeddings(filepath)")
+        conn.execute("CREATE INDEX IF NOT EXISTS idx_emb_indexed ON embeddings(indexed)")
+        conn.execute("CREATE INDEX IF NOT EXISTS idx_emb_scope ON embeddings(scope)")
+        conn.commit()
+    except Exception:
+        conn.close()  # do not leak the handle when setup fails
+        raise
+    return conn
+
+
+def upsert_chunk(conn: sqlite3.Connection, chunk: dict,
+                 vector: list[float] | None, dry_run: bool = False) -> bool:
+    """Insert a chunk into ``embeddings`` or refresh the row with the same id.
+
+    Args:
+        conn: open connection; the caller is responsible for committing.
+        chunk: dict with 'filepath' and 'text', optionally 'title' and 'scope'
+            (the scope is resolved from the filepath when absent).
+        vector: embedding of the chunk, or None/empty when none is available.
+        dry_run: when True nothing is written.
+
+    Returns:
+        Always True.
+
+    On conflict a missing vector never erases a stored one, and ``indexed``
+    and ``model`` always describe the vector that ends up in the row.
+    """
+    if dry_run:
+        return True
+
+    cid     = chunk_id(chunk['filepath'], chunk['text'])
+    blob    = vector_to_blob(vector) if vector else None
+    indexed = 1 if vector else 0
+    scope   = chunk.get('scope', resolve_scope(chunk['filepath']))
+
+    # excluded.indexed is never NULL (0 or 1), so COALESCE on it would reset
+    # the flag to 0 while keeping the old vector; derive it from the vector
+    # that is actually kept. model follows the same keep-if-missing rule.
+    conn.execute("""
+        INSERT INTO embeddings(chunk_id, filepath, title, chunk_text, vector, model, indexed, scope, updated_at)
+        VALUES (?,?,?,?,?,?,?,?, datetime('now'))
+        ON CONFLICT(chunk_id) DO UPDATE SET
+            title      = excluded.title,
+            chunk_text = excluded.chunk_text,
+            vector     = COALESCE(excluded.vector, embeddings.vector),
+            model      = COALESCE(excluded.model, embeddings.model),
+            indexed    = CASE WHEN COALESCE(excluded.vector, embeddings.vector) IS NULL
+                              THEN 0 ELSE 1 END,
+            scope      = excluded.scope,
+            updated_at = excluded.updated_at
+    """, (cid, chunk['filepath'], chunk.get('title',''), chunk['text'],
+          blob, EMBED_MODEL if vector else None, indexed, scope))
+    return True
+
+
+# ── Pipeline principal ────────────────────────────────────────────────────────
+
+def _strategy_for(rel: Path) -> str:
+    """Pick the chunking strategy of a BRAIN_ROOT-relative path from CORPUS_PATHS."""
+    for base, pattern, strategy in CORPUS_PATHS:
+        if base == '.':
+            # Root entries designate individual files, not the whole tree.
+            if len(rel.parts) == 1 and rel.match(pattern):
+                return strategy
+        else:
+            # Compare whole path components so 'agents' does not match 'agents-old'.
+            base_parts = Path(base).parts
+            if rel.parts[:len(base_parts)] == base_parts:
+                return strategy
+    return 'h2'
+
+
+def collect_files(target_file: str | None = None) -> list[tuple[Path, str]]:
+    """Return the list of (path, strategy) pairs to index.
+
+    Args:
+        target_file: path relative to BRAIN_ROOT of a single file to
+            re-index, or None to walk the whole corpus (CORPUS_PATHS).
+
+    Returns:
+        Files that exist and pass should_exclude and should_skip_by_zone.
+        With *target_file*, the list holds at most one entry and is empty
+        when the file is missing, outside BRAIN_ROOT or filtered out.
+    """
+    files = []
+
+    if target_file:
+        root = BRAIN_ROOT.resolve()
+        resolved = (root / target_file).resolve()
+        try:
+            # Component-wise containment: a plain string prefix test would
+            # accept a sibling such as "<root>-evil/file".
+            rel = resolved.relative_to(root)
+        except ValueError:
+            print(f"  🚨 --file hors BRAIN_ROOT refusé : {resolved}")
+            return files
+        p = BRAIN_ROOT / rel
+        if not p.is_file():
+            return files
+        # An explicit --file obeys the same privacy and zone rules as a full run.
+        if should_exclude(p) or should_skip_by_zone(p):
+            print(f"  🚨 --file privé, exclu ou périmé refusé : {rel}")
+            return files
+        files.append((p, _strategy_for(rel)))
+        return files
+
+    seen = set()
+    for base, pattern, strategy in CORPUS_PATHS:
+        base_path = BRAIN_ROOT / base
+        if not base_path.exists():
+            continue
+        for p in sorted(base_path.glob(pattern)):
+            if p in seen or not p.is_file():
+                continue
+            if should_exclude(p):
+                continue
+            if should_skip_by_zone(p):
+                continue
+            seen.add(p)
+            files.append((p, strategy))
+
+    return files
+
+
+def _print_stats(conn: sqlite3.Connection) -> None:
+    """Print counters describing the current state of the embeddings table."""
+    total   = conn.execute("SELECT COUNT(*) FROM embeddings").fetchone()[0]
+    indexed = conn.execute("SELECT COUNT(*) FROM embeddings WHERE indexed=1").fetchone()[0]
+    pending = total - indexed
+    files_n = conn.execute("SELECT COUNT(DISTINCT filepath) FROM embeddings").fetchone()[0]
+    print("Index embeddings :")
+    print(f"  chunks total  : {total}")
+    print(f"  indexés       : {indexed}  ({100*indexed//total if total else 0}%)")
+    print(f"  sans vecteur  : {pending}")
+    print(f"  fichiers      : {files_n}")
+    print(f"  modèle        : {EMBED_MODEL} @ {OLLAMA_URL}")
+
+
+def run(dry_run: bool = False, target_file: str | None = None,
+        stats_only: bool = False):
+    """Run the embedding pipeline.
+
+    Args:
+        dry_run: chunk the files and report counts without calling Ollama
+            and without writing any chunk.
+        target_file: restrict the run to one file (path relative to BRAIN_ROOT).
+        stats_only: only print index statistics, then return.
+
+    When Ollama does not answer the initial probe, chunks are still stored,
+    without vector (indexed=0), so a later run can complete them.
+    """
+    conn = connect()
+    try:
+        if stats_only:
+            _print_stats(conn)
+            return
+
+        files = collect_files(target_file)
+        print(f"Corpus : {len(files)} fichier(s) — modèle {EMBED_MODEL} @ {OLLAMA_URL}")
+
+        # Probe Ollama once before looping
+        test_vec = get_embedding("test connexion") if not dry_run else None
+        ollama_ok = test_vec is not None
+        if not ollama_ok and not dry_run:
+            print("  ⚠️  Ollama indisponible — chunks enregistrés sans vecteur (indexed=0)")
+
+        total_chunks = 0
+        total_indexed = 0
+
+        for filepath, strategy in files:
+            chunks = chunk_file(filepath, strategy)
+            if not chunks:
+                continue
+
+            file_vectors = 0
+            for chunk in chunks:
+                chunk['scope'] = resolve_scope(chunk['filepath'])
+                vec = get_embedding(chunk['text']) if ollama_ok and not dry_run else None
+                if vec:
+                    file_vectors += 1
+                upsert_chunk(conn, chunk, vec, dry_run=dry_run)
+
+            if not dry_run:
+                # Commit per file: embedding is slow, and an interrupted run
+                # must not lose the files already processed.
+                conn.commit()
+
+            total_chunks += len(chunks)
+            total_indexed += file_vectors
+
+            rel = str(filepath.relative_to(BRAIN_ROOT))
+            # ✅ only when every chunk of the file actually received a vector.
+            status = "✅" if file_vectors == len(chunks) else "⬜"
+            print(f"  {status} {rel} — {len(chunks)} chunk(s)")
+
+        print(f"\n{'[dry] ' if dry_run else ''}Chunks traités : {total_chunks}")
+        if not dry_run:
+            print(f"Vecteurs générés : {total_indexed}")
+            if not ollama_ok:
+                print("⚠️  Relancer avec Ollama actif pour compléter l'index")
+    finally:
+        # Release the database handle on every exit path, errors included.
+        conn.close()
+
+
+# ── CLI ───────────────────────────────────────────────────────────────────────
+
+def main():
+    """Command-line entry point: parse the flags and hand them over to run()."""
+    cli = argparse.ArgumentParser(description='brain-engine embed — pipeline embeddings BE-2c')
+    flag_specs = (
+        ('--dry-run', {'action': 'store_true', 'help': 'Liste les chunks sans embed'}),
+        ('--file',    {'metavar': 'PATH', 'help': 'Réindexer un fichier spécifique'}),
+        ('--stats',   {'action': 'store_true', 'help': 'Stats de l\'index actuel'}),
+    )
+    for flag, options in flag_specs:
+        cli.add_argument(flag, **options)
+    opts = cli.parse_args()
+
+    run(dry_run=opts.dry_run, target_file=opts.file, stats_only=opts.stats)
+
+
+if __name__ == '__main__':
+    main()