feat: brain-engine + brain-ui + docs — template full stack standalone
- brain-engine: server, embed, search, RAG, MCP, start.sh (standalone) - brain-ui: source React complète, build.sh, DocsView avec tier colors - docs: 14 pages guides humains (getting-started, architecture, sessions, workflows, agents, vues tier) - brain-compose.yml v0.9.0: tier featured ajouté, sessions/agents par tier, coach_level, API key schema - DISTRIBUTION_CHECKLIST v1.2: brain-engine + brain-ui + docs dans la checklist
This commit is contained in:
524
brain-engine/embed.py
Normal file
524
brain-engine/embed.py
Normal file
@@ -0,0 +1,524 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
brain-engine/embed.py — Pipeline d'embedding BE-2c
|
||||
Indexe le corpus brain via Ollama nomic-embed-text → table embeddings dans brain.db
|
||||
|
||||
Usage :
|
||||
python3 brain-engine/embed.py → index tout le corpus
|
||||
python3 brain-engine/embed.py --dry-run → liste les chunks sans embed
|
||||
python3 brain-engine/embed.py --file agents/helloWorld.md → réindexer un fichier
|
||||
python3 brain-engine/embed.py --stats → stats de l'index actuel
|
||||
|
||||
Headless : zéro dépendance Wayland/display.
|
||||
OLLAMA_URL : variable d'env (défaut localhost:11434) — supporte réseau local.
|
||||
|
||||
Zone filter — ADR-033a (2026-03-18) :
|
||||
kernel (agents/, wiki/, toolkit/, contexts/, KERNEL.md) → toujours indexé
|
||||
project (projets/, handoffs/, workspace/) → TTL 60 jours git-based
|
||||
session (claims/) → JAMAIS indexé
|
||||
personal (profil/bact/, profil/collaboration.md) → JAMAIS indexé
|
||||
profil/decisions/ → scope frontmatter (kernel | project)
|
||||
|
||||
Stratégie chunking par type :
|
||||
agents/*.md, projets/*.md, wiki/**/*.md → chunk par section H2
|
||||
workspace/**/*.md, profil/decisions/*.md → H2 ou fichier entier si < 512 tokens
|
||||
KERNEL.md, focus.md, contexts/ → fichier entier (documents courts)
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import json
|
||||
import struct
|
||||
import hashlib
|
||||
import argparse
|
||||
import sqlite3
|
||||
import subprocess
|
||||
import time
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
BRAIN_ROOT = Path(__file__).parent.parent
|
||||
DB_PATH = BRAIN_ROOT / 'brain.db'
|
||||
OLLAMA_URL = os.getenv('OLLAMA_URL', 'http://localhost:11434')
|
||||
EMBED_MODEL = os.getenv('EMBED_MODEL', 'nomic-embed-text')
|
||||
|
||||
# Guardrail — LLMs génériques interdits : freeze machine garanti sur corpus entier
|
||||
# (validé empiriquement : mistral:7b + qwen3:8b → freeze total ~20min, 2026-03-16)
|
||||
_BLOCKED_MODELS = ['mistral', 'qwen', 'llama', 'gemma', 'phi', 'deepseek']
|
||||
if any(b in EMBED_MODEL.lower() for b in _BLOCKED_MODELS):
|
||||
sys.exit(f"❌ EMBED_MODEL='{EMBED_MODEL}' interdit — LLM générique → freeze machine sur corpus entier.\n"
|
||||
f" Utiliser un modèle dédié embedding : nomic-embed-text, mxbai-embed-large, all-minilm")
|
||||
|
||||
CHUNK_TOKENS = 512 # tokens max par chunk (approximé : 1 token ≈ 4 chars)
|
||||
CHUNK_OVERLAP = 64 # overlap entre chunks consécutifs
|
||||
|
||||
# ── Zones d'accès ─────────────────────────────────────────────────────────────
|
||||
|
||||
# Zone 0 — jamais indexé (privé absolu) — ADR-033a
|
||||
PRIVATE_PATHS = [
|
||||
'profil/capital.md',
|
||||
'profil/objectifs.md',
|
||||
'profil/bact/', # personal — jamais
|
||||
'profil/collaboration.md',# personal — jamais
|
||||
'progression/', # personal — journal + tout le répertoire
|
||||
'MYSECRETS',
|
||||
]
|
||||
|
||||
# Zone par préfixe — premier match gagne — ADR-033a + KERNEL.md zones
|
||||
# Zones : kernel | instance | satellite | public (private = exclusion totale ci-dessus)
|
||||
PATH_SCOPES = [
|
||||
# KERNEL — protection maximale
|
||||
('contexts/', 'kernel'),
|
||||
('profil/decisions/', 'kernel'),
|
||||
('profil/', 'kernel'),
|
||||
('KERNEL.md', 'kernel'),
|
||||
('brain-constitution.md', 'kernel'),
|
||||
('scripts/', 'kernel'),
|
||||
# INSTANCE — configuration machine + projets actifs
|
||||
('focus.md', 'instance'),
|
||||
('projets/', 'instance'),
|
||||
('PATHS.md', 'instance'),
|
||||
('now.md', 'instance'),
|
||||
# SATELLITE — vie libre, promotion possible
|
||||
('toolkit/', 'satellite'),
|
||||
('todo/', 'satellite'),
|
||||
('workspace/', 'satellite'),
|
||||
('handoffs/', 'satellite'),
|
||||
('intentions/', 'satellite'),
|
||||
# PUBLIC — visible, distribué
|
||||
('wiki/', 'public'),
|
||||
('agents/', 'public'),
|
||||
('infrastructure/', 'public'),
|
||||
('BRAIN-INDEX.md', 'public'),
|
||||
]
|
||||
DEFAULT_SCOPE = 'public'
|
||||
|
||||
|
||||
TTL_PROJECT_DAYS = 60 # ADR-033a — TTL projet, git-based
|
||||
|
||||
|
||||
def is_private(filepath: str) -> bool:
|
||||
"""Zone 0 — jamais indexé, jamais accessible."""
|
||||
return any(filepath == p or filepath.startswith(p) for p in PRIVATE_PATHS)
|
||||
|
||||
|
||||
def resolve_scope(filepath: str) -> str:
|
||||
"""Retourne la zone d'accès (kernel | instance | satellite | public)."""
|
||||
for prefix, scope in PATH_SCOPES:
|
||||
if filepath == prefix or filepath.startswith(prefix):
|
||||
return scope
|
||||
return DEFAULT_SCOPE
|
||||
|
||||
|
||||
def get_frontmatter_scope(filepath: Path) -> str | None:
|
||||
"""
|
||||
Lit le champ scope: du frontmatter YAML d'un fichier .md.
|
||||
Retourne 'kernel' | 'project' | 'personal' | None si absent.
|
||||
ADR-033a Règle 2 — override sur la règle répertoire.
|
||||
"""
|
||||
try:
|
||||
text = filepath.read_text(errors='replace')
|
||||
if not text.startswith('---'):
|
||||
return None
|
||||
end = text.find('\n---', 3)
|
||||
if end == -1:
|
||||
return None
|
||||
for line in text[3:end].splitlines():
|
||||
line = line.strip()
|
||||
if line.startswith('scope:'):
|
||||
val = line[len('scope:'):].strip()
|
||||
val = val.split('#')[0].strip() # retire commentaires inline
|
||||
return val if val else None
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_git_age_days(filepath: Path) -> int | None:
|
||||
"""
|
||||
Retourne le nombre de jours depuis le dernier git commit sur ce fichier.
|
||||
None si le fichier n'est pas tracké ou si git échoue.
|
||||
ADR-033a — TTL git-based, aucun couplage BSI.
|
||||
"""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
['git', 'log', '-1', '--format=%ct', '--', str(filepath)],
|
||||
capture_output=True, text=True, cwd=str(BRAIN_ROOT), timeout=5
|
||||
)
|
||||
ts = result.stdout.strip()
|
||||
if not ts:
|
||||
return None
|
||||
age_secs = time.time() - int(ts)
|
||||
return int(age_secs / 86400)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def should_skip_by_zone(filepath: Path) -> bool:
|
||||
"""
|
||||
Applique les règles ADR-033a — retourne True si le fichier doit être exclu.
|
||||
|
||||
Règle 1 — répertoire (défaut)
|
||||
Règle 2 — frontmatter scope: (override sur Règle 1, pour profil/decisions/)
|
||||
|
||||
Zones :
|
||||
kernel → False (toujours indexé)
|
||||
project + TTL > 60j → True (périmé)
|
||||
personal → True (jamais)
|
||||
"""
|
||||
rel = str(filepath.relative_to(BRAIN_ROOT))
|
||||
|
||||
# profil/decisions/ — Règle 2 : scope par frontmatter
|
||||
if rel.startswith('profil/decisions/'):
|
||||
scope = get_frontmatter_scope(filepath)
|
||||
if scope == 'personal':
|
||||
return True
|
||||
if scope == 'project':
|
||||
age = get_git_age_days(filepath)
|
||||
return age is not None and age > TTL_PROJECT_DAYS
|
||||
# scope: kernel ou absent → toujours indexé
|
||||
return False
|
||||
|
||||
# Zone project — TTL git-based
|
||||
if any(rel.startswith(p) for p in ('projets/', 'handoffs/', 'workspace/')):
|
||||
age = get_git_age_days(filepath)
|
||||
return age is not None and age > TTL_PROJECT_DAYS
|
||||
|
||||
return False
|
||||
|
||||
|
||||
# Corpus à indexer — chemins relatifs à BRAIN_ROOT — ADR-033a
|
||||
# kernel → toujours | project → TTL 60j git | omis → JAMAIS
|
||||
CORPUS_PATHS = [
|
||||
# ── kernel — toujours indexé ──────────────────────────────────────────────
|
||||
('agents', '*.md', 'h2'), # agents brain
|
||||
('wiki', '**/*.md', 'h2'), # documentation (submodule)
|
||||
('toolkit', '**/*.md', 'h2'), # patterns réutilisables
|
||||
('contexts', '*.yml', 'file'), # contextes de session
|
||||
# ── project — TTL 60 jours git-based ─────────────────────────────────────
|
||||
('projets', '*.md', 'h2'),
|
||||
('handoffs', '*.md', 'file'),
|
||||
('workspace', '**/*.md', 'h2'),
|
||||
# ── profil/decisions — scope par frontmatter (kernel | project) ──────────
|
||||
('profil/decisions', '*.md', 'file'),
|
||||
# ── fichiers racine kernel ────────────────────────────────────────────────
|
||||
('.', 'KERNEL.md', 'file'),
|
||||
('.', 'focus.md', 'file'),
|
||||
('.', 'BRAIN-INDEX.md', 'file'),
|
||||
# SUPPRIMÉ : ('ADR', ...) — chemin obsolète (ADRs dans profil/decisions/)
|
||||
# SUPPRIMÉ : ('profil', ...) — trop large, inclut bact/ — géré par scope
|
||||
# SUPPRIMÉ : ('claims', ...) — JAMAIS indexé per ADR-033a (session structurée)
|
||||
]
|
||||
|
||||
# Fichiers à exclure
|
||||
EXCLUDE_PATTERNS = [
|
||||
'brain-template/',
|
||||
'brain-engine/',
|
||||
'.git/',
|
||||
'node_modules/',
|
||||
]
|
||||
|
||||
|
||||
# ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
def should_exclude(filepath: Path) -> bool:
|
||||
s = str(filepath)
|
||||
if any(p in s for p in EXCLUDE_PATTERNS):
|
||||
return True
|
||||
# Zone 0 — privé absolu, jamais indexé
|
||||
if filepath.is_absolute():
|
||||
try:
|
||||
rel = str(filepath.relative_to(BRAIN_ROOT))
|
||||
except ValueError:
|
||||
rel = s # path hors BRAIN_ROOT — is_private unlikely mais safe
|
||||
else:
|
||||
rel = s
|
||||
return is_private(rel)
|
||||
|
||||
|
||||
def chunk_by_h2(text: str, filepath: str) -> list[dict]:
|
||||
"""Découpe un markdown en chunks par section H2."""
|
||||
sections = re.split(r'\n(?=## )', text)
|
||||
chunks = []
|
||||
for sec in sections:
|
||||
sec = sec.strip()
|
||||
if not sec:
|
||||
continue
|
||||
# Si section trop longue → re-découper par paragraphes
|
||||
if len(sec) > CHUNK_TOKENS * 4:
|
||||
sub = chunk_by_size(sec, filepath)
|
||||
chunks.extend(sub)
|
||||
else:
|
||||
title = sec.split('\n')[0].strip('#').strip()
|
||||
chunks.append({'text': sec, 'title': title, 'filepath': filepath})
|
||||
return chunks if chunks else [{'text': text, 'title': '', 'filepath': filepath}]
|
||||
|
||||
|
||||
def chunk_by_size(text: str, filepath: str) -> list[dict]:
|
||||
"""Découpe un texte en chunks de CHUNK_TOKENS tokens (approx)."""
|
||||
max_chars = CHUNK_TOKENS * 4
|
||||
overlap_chars = CHUNK_OVERLAP * 4
|
||||
chunks = []
|
||||
start = 0
|
||||
while start < len(text):
|
||||
end = min(start + max_chars, len(text))
|
||||
# Couper sur un saut de ligne si possible
|
||||
if end < len(text):
|
||||
nl = text.rfind('\n', start, end)
|
||||
if nl > start:
|
||||
end = nl
|
||||
chunk_text = text[start:end].strip()
|
||||
if chunk_text:
|
||||
chunks.append({'text': chunk_text, 'title': '', 'filepath': filepath})
|
||||
if end >= len(text):
|
||||
break
|
||||
# Toujours avancer : si l'overlap remonterait avant start, aller à end
|
||||
next_start = end - overlap_chars
|
||||
start = next_start if next_start > start else end
|
||||
return chunks
|
||||
|
||||
|
||||
def chunk_file(filepath: Path, strategy: str) -> list[dict]:
|
||||
"""Lit un fichier et retourne ses chunks selon la stratégie."""
|
||||
try:
|
||||
text = filepath.read_text(errors='replace').strip()
|
||||
except Exception as e:
|
||||
print(f" ⚠️ {filepath.name} : erreur lecture — {e}")
|
||||
return []
|
||||
|
||||
if not text:
|
||||
return []
|
||||
|
||||
rel = str(filepath.relative_to(BRAIN_ROOT))
|
||||
|
||||
if strategy == 'h2':
|
||||
return chunk_by_h2(text, rel)
|
||||
else:
|
||||
# Fichier entier — si trop long, chunk par taille
|
||||
if len(text) > CHUNK_TOKENS * 4:
|
||||
return chunk_by_size(text, rel)
|
||||
title = filepath.stem
|
||||
return [{'text': text, 'title': title, 'filepath': rel}]
|
||||
|
||||
|
||||
def chunk_id(filepath: str, text: str) -> str:
|
||||
"""ID déterministe : hash(filepath + text[:64])."""
|
||||
h = hashlib.sha1(f"{filepath}::{text[:64]}".encode()).hexdigest()[:12]
|
||||
return f"emb-{h}"
|
||||
|
||||
|
||||
# ── Ollama API ────────────────────────────────────────────────────────────────
|
||||
|
||||
def get_embedding(text: str) -> list[float] | None:
|
||||
"""Appelle Ollama embeddings API — retourne None si indisponible."""
|
||||
url = f"{OLLAMA_URL}/api/embeddings"
|
||||
payload = json.dumps({"model": EMBED_MODEL, "prompt": text}).encode()
|
||||
req = urllib.request.Request(url, data=payload,
|
||||
headers={"Content-Type": "application/json"})
|
||||
try:
|
||||
with urllib.request.urlopen(req, timeout=30) as resp:
|
||||
data = json.loads(resp.read())
|
||||
return data.get('embedding')
|
||||
except (urllib.error.URLError, TimeoutError) as e:
|
||||
print(f" ⚠️ Ollama indisponible ({OLLAMA_URL}) : {e}")
|
||||
return None
|
||||
|
||||
|
||||
def vector_to_blob(vec: list[float]) -> bytes:
|
||||
"""Sérialise un vecteur float32 en BLOB SQLite."""
|
||||
return struct.pack(f'{len(vec)}f', *vec)
|
||||
|
||||
|
||||
def blob_to_vector(blob: bytes) -> list[float]:
|
||||
"""Désérialise un BLOB SQLite en vecteur float32."""
|
||||
n = len(blob) // 4
|
||||
return list(struct.unpack(f'{n}f', blob))
|
||||
|
||||
|
||||
# ── SQLite ────────────────────────────────────────────────────────────────────
|
||||
|
||||
def connect() -> sqlite3.Connection:
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
conn.row_factory = sqlite3.Row
|
||||
conn.execute("PRAGMA journal_mode=WAL")
|
||||
# Créer la table embeddings si absente (extend schema)
|
||||
conn.execute("""
|
||||
CREATE TABLE IF NOT EXISTS embeddings (
|
||||
chunk_id TEXT PRIMARY KEY,
|
||||
filepath TEXT NOT NULL,
|
||||
title TEXT,
|
||||
chunk_text TEXT NOT NULL,
|
||||
vector BLOB, -- NULL si Ollama indisponible au moment du chunk
|
||||
model TEXT,
|
||||
indexed INTEGER DEFAULT 0, -- 1 = vecteur présent
|
||||
scope TEXT NOT NULL DEFAULT 'work', -- kernel | instance | satellite | public
|
||||
created_at TEXT NOT NULL DEFAULT (datetime('now')),
|
||||
updated_at TEXT NOT NULL DEFAULT (datetime('now'))
|
||||
)
|
||||
""")
|
||||
# Migration — ajouter scope si absente (db existante avant BE-4)
|
||||
try:
|
||||
conn.execute("ALTER TABLE embeddings ADD COLUMN scope TEXT NOT NULL DEFAULT 'work'")
|
||||
conn.commit()
|
||||
# Backfill — résoudre le scope de chaque chunk existant depuis son filepath
|
||||
rows = conn.execute("SELECT DISTINCT filepath FROM embeddings WHERE scope = 'work'").fetchall()
|
||||
for row in rows:
|
||||
fp = row['filepath']
|
||||
s = resolve_scope(fp)
|
||||
if s != 'work':
|
||||
conn.execute("UPDATE embeddings SET scope = ? WHERE filepath = ?", (s, fp))
|
||||
conn.commit()
|
||||
except Exception:
|
||||
pass # colonne déjà présente
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_emb_filepath ON embeddings(filepath)")
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_emb_indexed ON embeddings(indexed)")
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_emb_scope ON embeddings(scope)")
|
||||
conn.commit()
|
||||
return conn
|
||||
|
||||
|
||||
def upsert_chunk(conn: sqlite3.Connection, chunk: dict,
|
||||
vector: list[float] | None, dry_run: bool = False) -> bool:
|
||||
cid = chunk_id(chunk['filepath'], chunk['text'])
|
||||
blob = vector_to_blob(vector) if vector else None
|
||||
indexed = 1 if vector else 0
|
||||
scope = chunk.get('scope', resolve_scope(chunk['filepath']))
|
||||
|
||||
if dry_run:
|
||||
return True
|
||||
|
||||
conn.execute("""
|
||||
INSERT INTO embeddings(chunk_id, filepath, title, chunk_text, vector, model, indexed, scope, updated_at)
|
||||
VALUES (?,?,?,?,?,?,?,?, datetime('now'))
|
||||
ON CONFLICT(chunk_id) DO UPDATE SET
|
||||
chunk_text = excluded.chunk_text,
|
||||
vector = COALESCE(excluded.vector, embeddings.vector),
|
||||
indexed = COALESCE(excluded.indexed, embeddings.indexed),
|
||||
scope = excluded.scope,
|
||||
updated_at = excluded.updated_at
|
||||
""", (cid, chunk['filepath'], chunk.get('title',''), chunk['text'],
|
||||
blob, EMBED_MODEL if vector else None, indexed, scope))
|
||||
return True
|
||||
|
||||
|
||||
# ── Pipeline principal ────────────────────────────────────────────────────────
|
||||
|
||||
def collect_files(target_file: str | None = None) -> list[tuple[Path, str]]:
|
||||
"""Retourne la liste (path, strategy) des fichiers à indexer."""
|
||||
files = []
|
||||
seen = set()
|
||||
|
||||
if target_file:
|
||||
p = (BRAIN_ROOT / target_file).resolve()
|
||||
if not str(p).startswith(str(BRAIN_ROOT.resolve())):
|
||||
print(f" 🚨 --file hors BRAIN_ROOT refusé : {p}")
|
||||
return files
|
||||
if p.exists():
|
||||
# Déterminer stratégie par répertoire
|
||||
for base, pattern, strategy in CORPUS_PATHS:
|
||||
if str(p).startswith(str(BRAIN_ROOT / base)):
|
||||
files.append((p, strategy))
|
||||
break
|
||||
else:
|
||||
files.append((p, 'h2'))
|
||||
return files
|
||||
|
||||
for base, pattern, strategy in CORPUS_PATHS:
|
||||
base_path = BRAIN_ROOT / base
|
||||
if not base_path.exists():
|
||||
continue
|
||||
for p in sorted(base_path.glob(pattern)):
|
||||
if p in seen or not p.is_file():
|
||||
continue
|
||||
if should_exclude(p):
|
||||
continue
|
||||
if should_skip_by_zone(p):
|
||||
continue
|
||||
seen.add(p)
|
||||
files.append((p, strategy))
|
||||
|
||||
return files
|
||||
|
||||
|
||||
def run(dry_run: bool = False, target_file: str | None = None,
|
||||
stats_only: bool = False):
|
||||
|
||||
conn = connect()
|
||||
|
||||
if stats_only:
|
||||
total = conn.execute("SELECT COUNT(*) FROM embeddings").fetchone()[0]
|
||||
indexed = conn.execute("SELECT COUNT(*) FROM embeddings WHERE indexed=1").fetchone()[0]
|
||||
pending = total - indexed
|
||||
files_n = conn.execute("SELECT COUNT(DISTINCT filepath) FROM embeddings").fetchone()[0]
|
||||
print(f"Index embeddings :")
|
||||
print(f" chunks total : {total}")
|
||||
print(f" indexés : {indexed} ({100*indexed//total if total else 0}%)")
|
||||
print(f" sans vecteur : {pending}")
|
||||
print(f" fichiers : {files_n}")
|
||||
print(f" modèle : {EMBED_MODEL} @ {OLLAMA_URL}")
|
||||
conn.close()
|
||||
return
|
||||
|
||||
files = collect_files(target_file)
|
||||
print(f"Corpus : {len(files)} fichier(s) — modèle {EMBED_MODEL} @ {OLLAMA_URL}")
|
||||
|
||||
# Tester Ollama avant de boucler
|
||||
test_vec = get_embedding("test connexion") if not dry_run else None
|
||||
ollama_ok = test_vec is not None
|
||||
if not ollama_ok and not dry_run:
|
||||
print(f" ⚠️ Ollama indisponible — chunks enregistrés sans vecteur (indexed=0)")
|
||||
|
||||
total_chunks = 0
|
||||
total_indexed = 0
|
||||
|
||||
for filepath, strategy in files:
|
||||
chunks = chunk_file(filepath, strategy)
|
||||
if not chunks:
|
||||
continue
|
||||
|
||||
file_chunks = 0
|
||||
for chunk in chunks:
|
||||
chunk['scope'] = resolve_scope(chunk['filepath'])
|
||||
vec = None
|
||||
if ollama_ok and not dry_run:
|
||||
vec = get_embedding(chunk['text'])
|
||||
if vec:
|
||||
total_indexed += 1
|
||||
|
||||
upsert_chunk(conn, chunk, vec, dry_run=dry_run)
|
||||
total_chunks += 1
|
||||
file_chunks += 1
|
||||
|
||||
rel = str(filepath.relative_to(BRAIN_ROOT))
|
||||
status = "✅" if ollama_ok else "⬜"
|
||||
print(f" {status} {rel} — {file_chunks} chunk(s)")
|
||||
|
||||
if not dry_run:
|
||||
conn.commit()
|
||||
|
||||
print(f"\n{'[dry] ' if dry_run else ''}Chunks traités : {total_chunks}")
|
||||
if not dry_run:
|
||||
print(f"Vecteurs générés : {total_indexed}")
|
||||
if not ollama_ok:
|
||||
print(f"⚠️ Relancer avec Ollama actif pour compléter l'index")
|
||||
|
||||
conn.close()
|
||||
|
||||
|
||||
# ── CLI ───────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description='brain-engine embed — pipeline embeddings BE-2c')
|
||||
parser.add_argument('--dry-run', action='store_true', help='Liste les chunks sans embed')
|
||||
parser.add_argument('--file', metavar='PATH', help='Réindexer un fichier spécifique')
|
||||
parser.add_argument('--stats', action='store_true', help='Stats de l\'index actuel')
|
||||
args = parser.parse_args()
|
||||
|
||||
run(dry_run=args.dry_run, target_file=args.file, stats_only=args.stats)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Reference in New Issue
Block a user