## What Is Tracked

Generation metadata (including commit SHA) is stored per variant (name, ai_provider, ai_model, owner) in SQLite:

```57:73:src/docsfy/storage.py
CREATE TABLE IF NOT EXISTS projects (
    name TEXT NOT NULL,
    ai_provider TEXT NOT NULL DEFAULT '',
    ai_model TEXT NOT NULL DEFAULT '',
    owner TEXT NOT NULL DEFAULT '',
    repo_url TEXT NOT NULL,
    status TEXT NOT NULL DEFAULT 'generating',
    current_stage TEXT,
    last_commit_sha TEXT,
    last_generated TEXT,
    page_count INTEGER DEFAULT 0,
    error_message TEXT,
    plan_json TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (name, ai_provider, ai_model, owner)
)
```

> **Note:** Incremental behavior is variant-scoped, not just project-scoped. Different providers/models maintain independent commit and cache state.

## Commit Diff Detection

When generation starts (and `force` is not set), `docsfy` compares the stored commit SHA to the current repository SHA.

If the SHA is identical, it exits early as `up_to_date`:

```850:868:src/docsfy/main.py
        if existing and existing.get("last_generated"):
            old_sha = (
                str(existing["last_commit_sha"])
                if existing.get("last_commit_sha")
                else None
            )
            if old_sha == commit_sha:
                logger.info(
                    f"[{project_name}] Project is up to date at {commit_sha[:8]}"
                )
                await update_project_status(
                    project_name,
                    ai_provider,
                    ai_model,
                    status="ready",
                    owner=owner,
                    current_stage="up_to_date",
                )
                return
```

If SHAs differ, it computes file-level diffs using Git:

```48:73:src/docsfy/repository.py
def get_changed_files(repo_path: Path, old_sha: str, new_sha: str) -> list[str] | None:
    """Get list of files changed between two commits.

    Returns None on error (caller should fall back to full regeneration),
    or an empty list when there are no changes.
    """
    if not re.match(r"^[0-9a-fA-F]{4,64}$", old_sha) or not re.match(
        r"^[0-9a-fA-F]{4,64}$", new_sha
    ):
        logger.warning("Invalid SHA format")
        return None
    try:
        result = subprocess.run(
            ["git", "diff", "--name-only", old_sha, new_sha],
            cwd=repo_path,
            capture_output=True,
            text=True,
            timeout=30,
        )
    except (subprocess.TimeoutExpired, OSError) as exc:
        logger.warning(f"Failed to get diff: {exc}")
        return None
    if result.returncode != 0:
        logger.warning(f"Failed to get diff: {result.stderr}")
        return None
    return [f.strip() for f in result.stdout.strip().split("\n") if f.strip()]
```

## Page-Level Cache Invalidation

After diff detection, the system runs an incremental planner and invalidates only selected cached pages (by slug), then reuses all other page caches.

```891:955:src/docsfy/main.py
    if old_sha and old_sha != commit_sha and not force and existing:
        changed_files = get_changed_files(repo_dir, old_sha, commit_sha)
        if changed_files is None:
            # Error getting diff — fall back to full regeneration
            use_cache = False
        elif not changed_files:
            # Commits differ but tree is identical — nothing to regenerate
            await update_project_status(
                project_name,
                ai_provider,
                ai_model,
                status="ready",
                owner=owner,
                current_stage="up_to_date",
                last_commit_sha=commit_sha,
            )
            return
        elif changed_files:
            existing_plan_json = existing.get("plan_json")
            if existing_plan_json:
                try:
                    existing_plan = json.loads(str(existing_plan_json))
                    await update_project_status(
                        project_name,
                        ai_provider,
                        ai_model,
                        status="generating",
                        owner=owner,
                        current_stage="incremental_planning",
                    )
                    pages_to_regen = await run_incremental_planner(
                        repo_dir,
                        project_name,
                        ai_provider,
                        ai_model,
                        changed_files,
                        existing_plan,
                        ai_cli_timeout,
                    )
                    if pages_to_regen != ["all"]:
                        # Delete only the cached pages that need regeneration
                        for slug in pages_to_regen:
                            # Validate slug to prevent path traversal
                            if (
                                "/" in slug
                                or "\\" in slug
                                or ".." in slug
                                or slug.startswith(".")
                            ):
                                logger.warning(
                                    f"[{project_name}] Skipping invalid slug from incremental planner: {slug}"
                                )
                                continue
                            cache_file = cache_dir / f"{slug}.md"
                            # Extra safety: ensure the resolved path is inside cache_dir
                            try:
                                cache_file.resolve().relative_to(cache_dir.resolve())
                            except ValueError:
                                logger.warning(
                                    f"[{project_name}] Path traversal attempt in slug: {slug}"
                                )
                                continue
                            if cache_file.exists():
                                cache_file.unlink()
                        use_cache = True
```

Page cache entries are slug-based markdown files:

```89:114:src/docsfy/generator.py
    cache_file = cache_dir / f"{slug}.md"
    if use_cache and cache_file.exists():
        logger.debug(f"[{_label}] Using cached page: {slug}")
        return cache_file.read_text(encoding="utf-8")

    prompt = build_page_prompt(
        project_name=repo_path.name, page_title=title, page_description=description
    )
    # Build CLI flags based on provider
    cli_flags = ["--trust"] if ai_provider == "cursor" else None
    success, output = await call_ai_cli(
        prompt=prompt,
        cwd=repo_path,
        ai_provider=ai_provider,
        ai_model=ai_model,
        ai_cli_timeout=ai_cli_timeout,
        cli_flags=cli_flags,
    )
    if not success:
        logger.warning(f"[{_label}] Failed to generate page '{slug}': {output}")
        output = f"# {title}\n\n*Documentation generation failed. Please re-run.*"

    output = _strip_ai_preamble(output)
    cache_dir.mkdir(parents=True, exist_ok=True)
    cache_file.write_text(output, encoding="utf-8")
```

Cache directory resolution:

```527:530:src/docsfy/storage.py
def get_project_cache_dir(
    name: str, ai_provider: str = "", ai_model: str = "", owner: str = ""
) -> Path:
    return get_project_dir(name, ai_provider, ai_model, owner) / "cache" / "pages"
```

> **Tip:** Because cache is per slug (`{slug}.md`), incremental regeneration is fastest when page slugs remain stable across planner runs.

## Fallback to Full Regeneration

There are three fallback signals in code:

  1. Diff failure (changed_files is None)
  2. Incremental planner failure / parse failure
  3. Incremental planner returning unusable output

Incremental planner fallback behavior:

```229:239:src/docsfy/generator.py
    if not success:
        logger.warning(f"[{project_name}] Incremental planner failed, regenerating all")
        return ["all"]

    result = parse_json_list_response(output)
    if result is None or not isinstance(result, list):
        return ["all"]
    # Validate all items are strings
    result = [item for item in result if isinstance(item, str)]
    if not result:
        return ["all"]
```

Planner prompt contract includes both `["all"]` and `[]` outputs:

```56:63:src/docsfy/prompts.py
Which pages from the existing plan need to be regenerated based on the changed files?
Output a JSON array of page slugs that need regeneration.

CRITICAL: Output ONLY a JSON array of strings. No explanation.
Example: ["introduction", "api-reference", "configuration"]
If all pages need regeneration, output: ["all"]
If no pages need regeneration, output: []
```

Forced full regeneration is explicit and clears cache first:

```832:845:src/docsfy/main.py
    if force:
        cache_dir = get_project_cache_dir(project_name, ai_provider, ai_model, owner)
        if cache_dir.exists():
            shutil.rmtree(cache_dir)
            logger.info(f"[{project_name}] Cleared cache (force=True)")
        # Reset page count so API shows 0 during regeneration
        await update_project_status(
            project_name,
            ai_provider,
            ai_model,
            status="generating",
            owner=owner,
            page_count=0,
        )
```

And `force` is exposed at API model level:

```18:20:src/docsfy/models.py
    force: bool = Field(
        default=False, description="Force full regeneration, ignoring cache"
    )
```

Dashboard sends force in generation requests:

```2043:2047:src/docsfy/templates/dashboard.html
var body = { repo_url: repoUrl, ai_provider: provider, force: force };
```

> **Warning:** For non-force runs, `generate_all_pages` is called with `use_cache=use_cache if use_cache else not force`, which evaluates to `True` whenever `force` is `False`. In practice, this means true “full regeneration” is guaranteed when `force=true` (cache is deleted), while automatic fallback branches depend on whether cache files were invalidated/removed first.

## Runtime and Deployment Impact on Cache

Cache and metadata persist when `/data` is mounted:

```7:13:docker-compose.yaml
    volumes:
      - ./data:/data
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
```

> **Warning:** Remote repositories are cloned shallow (`--depth 1`), which can prevent diffing against older stored SHAs if that commit is not present locally.

```25:27:src/docsfy/repository.py
result = subprocess.run(
    ["git", "clone", "--depth", "1", "--", repo_url, str(repo_path)],
```

## Test and Pipeline Coverage

Key tests validate incremental/cache behaviors:

- Diff outcomes (`list`, `None`, empty list) in `tests/test_repository.py`
- Cache hit behavior in `tests/test_generator.py`
- Incremental planner fallback to `["all"]` in `tests/test_generator.py`

```85:124:tests/test_repository.py
def test_get_changed_files_success(tmp_path: Path) -> None:
    from docsfy.repository import get_changed_files

    with patch("docsfy.repository.subprocess.run") as mock_run:
        mock_run.return_value = MagicMock(
            returncode=0,
            stdout="src/main.py\nsrc/utils.py\nREADME.md\n",
            stderr="",
        )
        files = get_changed_files(tmp_path, "abc123", "def456")

    assert files == ["src/main.py", "src/utils.py", "README.md"]
    call_args = mock_run.call_args
    assert "diff" in call_args.args[0]
    assert "--name-only" in call_args.args[0]
    assert "abc123" in call_args.args[0]
    assert "def456" in call_args.args[0]
```

```103:123:tests/test_generator.py
async def test_generate_page_uses_cache(tmp_path: Path) -> None:
    from docsfy.generator import generate_page

    cache_dir = tmp_path / "cache"
    cache_dir.mkdir()
    cached = cache_dir / "introduction.md"
    cached.write_text("# Cached content")

    md = await generate_page(
        repo_path=tmp_path,
        slug="introduction",
        title="Introduction",
        description="Overview",
        cache_dir=cache_dir,
        ai_provider="claude",
        ai_model="opus",
        use_cache=True,
    )

    assert md == "# Cached content"
```

```144:183:tests/test_generator.py
async def test_run_incremental_planner_returns_all_on_failure(
    tmp_path: Path, sample_plan: dict
) -> None:
    from docsfy.generator import run_incremental_planner

    with patch(
        "docsfy.generator.call_ai_cli",
        return_value=(False, "AI error"),
    ):
        result = await run_incremental_planner(
            repo_path=tmp_path,
            project_name="test-repo",
            ai_provider="claude",
            ai_model="opus",
            changed_files=["src/main.py"],
            existing_plan=sample_plan,
        )

    assert result == ["all"]
```

Project automation used for CI-style validation in-repo:

```1:7:tox.toml
skipsdist = true

envlist = ["unittests"]

[env.unittests]
deps = ["uv"]
commands = [["uv", "run", "--extra", "dev", "pytest", "-n", "auto", "tests"]]
```

```43:60:.pre-commit-config.yaml
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.15.2
    hooks:
      - id: ruff
      - id: ruff-format

  - repo: https://github.com/gitleaks/gitleaks
    rev: v8.30.0
    hooks:
      - id: gitleaks

  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.19.1
    hooks:
      - id: mypy
        exclude: (tests/)
        additional_dependencies:
          [types-requests, types-PyYAML, types-colorama, types-aiofiles, pydantic, types-Markdown]
```