## What Is Tracked
Generation metadata (including commit SHA) is stored per variant (name, ai_provider, ai_model, owner) in SQLite:
```57:73:src/docsfy/storage.py
CREATE TABLE IF NOT EXISTS projects (
    name TEXT NOT NULL,
    ai_provider TEXT NOT NULL DEFAULT '',
    ai_model TEXT NOT NULL DEFAULT '',
    owner TEXT NOT NULL DEFAULT '',
    repo_url TEXT NOT NULL,
    status TEXT NOT NULL DEFAULT 'generating',
    current_stage TEXT,
    last_commit_sha TEXT,
    last_generated TEXT,
    page_count INTEGER DEFAULT 0,
    error_message TEXT,
    plan_json TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (name, ai_provider, ai_model, owner)
)
```

> **Note:** Incremental behavior is variant-scoped, not just project-scoped. Different providers/models maintain independent commit and cache state.
## Commit Diff Detection
When generation starts (and `force` is not set), `docsfy` compares the stored commit SHA to the current repository SHA.
If the SHA is identical, it exits early as `up_to_date`:
```850:868:src/docsfy/main.py
if existing and existing.get("last_generated"):
    old_sha = (
        str(existing["last_commit_sha"])
        if existing.get("last_commit_sha")
        else None
    )
    if old_sha == commit_sha:
        logger.info(
            f"[{project_name}] Project is up to date at {commit_sha[:8]}"
        )
        await update_project_status(
            project_name,
            ai_provider,
            ai_model,
            status="ready",
            owner=owner,
            current_stage="up_to_date",
        )
        return
```

If SHAs differ, it computes file-level diffs using Git:
```48:73:src/docsfy/repository.py
def get_changed_files(repo_path: Path, old_sha: str, new_sha: str) -> list[str] | None:
    """Get list of files changed between two commits.

    Returns None on error (caller should fall back to full regeneration),
    or an empty list when there are no changes.
    """
    if not re.match(r"^[0-9a-fA-F]{4,64}$", old_sha) or not re.match(
        r"^[0-9a-fA-F]{4,64}$", new_sha
    ):
        logger.warning("Invalid SHA format")
        return None
    try:
        result = subprocess.run(
            ["git", "diff", "--name-only", old_sha, new_sha],
            cwd=repo_path,
            capture_output=True,
            text=True,
            timeout=30,
        )
    except (subprocess.TimeoutExpired, OSError) as exc:
        logger.warning(f"Failed to get diff: {exc}")
        return None
    if result.returncode != 0:
        logger.warning(f"Failed to get diff: {result.stderr}")
        return None
    return [f.strip() for f in result.stdout.strip().split("\n") if f.strip()]
```

## Page-Level Cache Invalidation
After diff detection, the system runs an incremental planner and invalidates only selected cached pages (by slug), then reuses all other page caches.
```891:955:src/docsfy/main.py
if old_sha and old_sha != commit_sha and not force and existing:
    changed_files = get_changed_files(repo_dir, old_sha, commit_sha)
    if changed_files is None:
        # Error getting diff — fall back to full regeneration
        use_cache = False
    elif not changed_files:
        # Commits differ but tree is identical — nothing to regenerate
        await update_project_status(
            project_name,
            ai_provider,
            ai_model,
            status="ready",
            owner=owner,
            current_stage="up_to_date",
            last_commit_sha=commit_sha,
        )
        return
    elif changed_files:
        existing_plan_json = existing.get("plan_json")
        if existing_plan_json:
            try:
                existing_plan = json.loads(str(existing_plan_json))
                await update_project_status(
                    project_name,
                    ai_provider,
                    ai_model,
                    status="generating",
                    owner=owner,
                    current_stage="incremental_planning",
                )
                pages_to_regen = await run_incremental_planner(
                    repo_dir,
                    project_name,
                    ai_provider,
                    ai_model,
                    changed_files,
                    existing_plan,
                    ai_cli_timeout,
                )
                if pages_to_regen != ["all"]:
                    # Delete only the cached pages that need regeneration
                    for slug in pages_to_regen:
                        # Validate slug to prevent path traversal
                        if (
                            "/" in slug
                            or "\\" in slug
                            or ".." in slug
                            or slug.startswith(".")
                        ):
                            logger.warning(
                                f"[{project_name}] Skipping invalid slug from incremental planner: {slug}"
                            )
                            continue
                        cache_file = cache_dir / f"{slug}.md"
                        # Extra safety: ensure the resolved path is inside cache_dir
                        try:
                            cache_file.resolve().relative_to(cache_dir.resolve())
                        except ValueError:
                            logger.warning(
                                f"[{project_name}] Path traversal attempt in slug: {slug}"
                            )
                            continue
                        if cache_file.exists():
                            cache_file.unlink()
                    use_cache = True
```

Page cache entries are slug-based markdown files:
```89:114:src/docsfy/generator.py
cache_file = cache_dir / f"{slug}.md"
if use_cache and cache_file.exists():
    logger.debug(f"[{_label}] Using cached page: {slug}")
    return cache_file.read_text(encoding="utf-8")

prompt = build_page_prompt(
    project_name=repo_path.name, page_title=title, page_description=description
)
# Build CLI flags based on provider
cli_flags = ["--trust"] if ai_provider == "cursor" else None
success, output = await call_ai_cli(
    prompt=prompt,
    cwd=repo_path,
    ai_provider=ai_provider,
    ai_model=ai_model,
    ai_cli_timeout=ai_cli_timeout,
    cli_flags=cli_flags,
)
if not success:
    logger.warning(f"[{_label}] Failed to generate page '{slug}': {output}")
    output = f"# {title}\n\n*Documentation generation failed. Please re-run.*"
output = _strip_ai_preamble(output)
cache_dir.mkdir(parents=True, exist_ok=True)
cache_file.write_text(output, encoding="utf-8")
```

Cache directory resolution:
```527:530:src/docsfy/storage.py
def get_project_cache_dir(
    name: str, ai_provider: str = "", ai_model: str = "", owner: str = ""
) -> Path:
    return get_project_dir(name, ai_provider, ai_model, owner) / "cache" / "pages"
```

> **Tip:** Because the cache is per slug (`{slug}.md`), incremental regeneration is fastest when page slugs remain stable across planner runs.
## Fallback to Full Regeneration

There are three fallback signals in code:

- Diff failure (`changed_files` is `None`)
- Incremental planner failure / parse failure
- Incremental planner returning unusable output
Incremental planner fallback behavior:
```229:239:src/docsfy/generator.py
if not success:
    logger.warning(f"[{project_name}] Incremental planner failed, regenerating all")
    return ["all"]

result = parse_json_list_response(output)
if result is None or not isinstance(result, list):
    return ["all"]
# Validate all items are strings
result = [item for item in result if isinstance(item, str)]
if not result:
    return ["all"]
```

Planner prompt contract includes both `["all"]` and `[]` outputs:
```56:63:src/docsfy/prompts.py
Which pages from the existing plan need to be regenerated based on the changed files?
Output a JSON array of page slugs that need regeneration.
CRITICAL: Output ONLY a JSON array of strings. No explanation.
Example: ["introduction", "api-reference", "configuration"]
If all pages need regeneration, output: ["all"]
If no pages need regeneration, output: []
```

Forced full regeneration is explicit and clears cache first:
```832:845:src/docsfy/main.py
if force:
    cache_dir = get_project_cache_dir(project_name, ai_provider, ai_model, owner)
    if cache_dir.exists():
        shutil.rmtree(cache_dir)
        logger.info(f"[{project_name}] Cleared cache (force=True)")
    # Reset page count so API shows 0 during regeneration
    await update_project_status(
        project_name,
        ai_provider,
        ai_model,
        status="generating",
        owner=owner,
        page_count=0,
    )
```

And `force` is exposed at the API model level:
```18:20:src/docsfy/models.py
force: bool = Field(
    default=False, description="Force full regeneration, ignoring cache"
)
```

The dashboard sends `force` in generation requests:
```2043:2047:src/docsfy/templates/dashboard.html
var body = { repo_url: repoUrl, ai_provider: provider, force: force };
```

> **Warning:** For non-force runs, `generate_all_pages` is called with `use_cache=use_cache if use_cache else not force`, which evaluates to `True` whenever `force` is `False`. In practice, this means a true "full regeneration" is guaranteed only when `force=true` (the cache directory is deleted up front), while the automatic fallback branches depend on whether the relevant cache files were invalidated or removed first.
## Runtime and Deployment Impact on Cache
Cache and metadata persist when `/data` is mounted:
```7:13:docker-compose.yaml
volumes:
  - ./data:/data
healthcheck:
  test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
  interval: 30s
  timeout: 10s
  retries: 3
```

> **Warning:** Remote repositories are cloned shallow (`--depth 1`), which can prevent diffing against older stored SHAs if that commit is not present locally.
```25:27:src/docsfy/repository.py
result = subprocess.run(
    ["git", "clone", "--depth", "1", "--", repo_url, str(repo_path)],
```

## Test and Pipeline Coverage
Key tests validate incremental/cache behaviors:
- Diff outcomes (`list`, `None`, empty list) in `tests/test_repository.py`
- Cache hit behavior in `tests/test_generator.py`
- Incremental planner fallback to `["all"]` in `tests/test_generator.py`
```85:124:tests/test_repository.py
def test_get_changed_files_success(tmp_path: Path) -> None:
    from docsfy.repository import get_changed_files

    with patch("docsfy.repository.subprocess.run") as mock_run:
        mock_run.return_value = MagicMock(
            returncode=0,
            stdout="src/main.py\nsrc/utils.py\nREADME.md\n",
            stderr="",
        )
        files = get_changed_files(tmp_path, "abc123", "def456")
        assert files == ["src/main.py", "src/utils.py", "README.md"]
        call_args = mock_run.call_args
        assert "diff" in call_args.args[0]
        assert "--name-only" in call_args.args[0]
        assert "abc123" in call_args.args[0]
        assert "def456" in call_args.args[0]
```

```103:123:tests/test_generator.py
async def test_generate_page_uses_cache(tmp_path: Path) -> None:
    from docsfy.generator import generate_page

    cache_dir = tmp_path / "cache"
    cache_dir.mkdir()
    cached = cache_dir / "introduction.md"
    cached.write_text("# Cached content")
    md = await generate_page(
        repo_path=tmp_path,
        slug="introduction",
        title="Introduction",
        description="Overview",
        cache_dir=cache_dir,
        ai_provider="claude",
        ai_model="opus",
        use_cache=True,
    )
    assert md == "# Cached content"
```

```144:183:tests/test_generator.py
async def test_run_incremental_planner_returns_all_on_failure(
    tmp_path: Path, sample_plan: dict
) -> None:
    from docsfy.generator import run_incremental_planner

    with patch(
        "docsfy.generator.call_ai_cli",
        return_value=(False, "AI error"),
    ):
        result = await run_incremental_planner(
            repo_path=tmp_path,
            project_name="test-repo",
            ai_provider="claude",
            ai_model="opus",
            changed_files=["src/main.py"],
            existing_plan=sample_plan,
        )
    assert result == ["all"]
```

Project automation used for CI-style validation in-repo:
```1:7:tox.toml
skipsdist = true
envlist = ["unittests"]
[env.unittests]
deps = ["uv"]
commands = [["uv", "run", "--extra", "dev", "pytest", "-n", "auto", "tests"]]
```

```43:60:.pre-commit-config.yaml
- repo: https://github.com/astral-sh/ruff-pre-commit
  rev: v0.15.2
  hooks:
    - id: ruff
    - id: ruff-format
- repo: https://github.com/gitleaks/gitleaks
  rev: v8.30.0
  hooks:
    - id: gitleaks
- repo: https://github.com/pre-commit/mirrors-mypy
  rev: v1.19.1
  hooks:
    - id: mypy
      exclude: (tests/)
      additional_dependencies:
        [types-requests, types-PyYAML, types-colorama, types-aiofiles, pydantic, types-Markdown]
```