Storage Paths and Data Layout
docsfy keeps persistent runtime state under DATA_DIR, with a clear split between:
- SQLite metadata (docsfy.db)
- per-variant filesystem artifacts (projects/...)
- generated static documentation site output (site/...)
DATA_DIR Usage
DATA_DIR is a first-class setting, defaulting to /data, and is wired into startup DB initialization.
# src/docsfy/config.py
class Settings(BaseSettings):
model_config = SettingsConfigDict(
env_file=".env",
env_file_encoding="utf-8",
extra="ignore",
)
admin_key: str = "" # Required — validated at startup
ai_provider: str = "claude"
ai_model: str = "claude-opus-4-6[1m]" # [1m] = 1 million token context window
ai_cli_timeout: int = Field(default=60, gt=0)
log_level: str = "INFO"
data_dir: str = "/data"
secure_cookies: bool = True # Set to False for local HTTP dev
# src/docsfy/main.py
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncIterator[None]:
settings = get_settings()
if not settings.admin_key:
logger.error("ADMIN_KEY environment variable is required")
raise SystemExit(1)
if len(settings.admin_key) < 16:
logger.error("ADMIN_KEY must be at least 16 characters long")
raise SystemExit(1)
_generating.clear()
await init_db(data_dir=settings.data_dir)
await cleanup_expired_sessions()
yield
# src/docsfy/storage.py
DB_PATH = Path(os.getenv("DATA_DIR", "/data")) / "docsfy.db"
DATA_DIR = Path(os.getenv("DATA_DIR", "/data"))
PROJECTS_DIR = DATA_DIR / "projects"
async def init_db(data_dir: str = "") -> None:
global DB_PATH, DATA_DIR, PROJECTS_DIR
if data_dir:
DB_PATH = Path(data_dir) / "docsfy.db"
DATA_DIR = Path(data_dir)
PROJECTS_DIR = DATA_DIR / "projects"
DB_PATH.parent.mkdir(parents=True, exist_ok=True)
PROJECTS_DIR.mkdir(parents=True, exist_ok=True)
Note:
.env.example does not currently include DATA_DIR, but the app supports it via Settings.data_dir and os.getenv("DATA_DIR", "/data").
SQLite DB Location and Contents
SQLite DB path:
- <DATA_DIR>/docsfy.db
The DB is initialized in init_db() and includes project metadata plus auth/session data.
# src/docsfy/storage.py
await db.execute("""
CREATE TABLE IF NOT EXISTS projects (
name TEXT NOT NULL,
ai_provider TEXT NOT NULL DEFAULT '',
ai_model TEXT NOT NULL DEFAULT '',
owner TEXT NOT NULL DEFAULT '',
repo_url TEXT NOT NULL,
status TEXT NOT NULL DEFAULT 'generating',
current_stage TEXT,
last_commit_sha TEXT,
last_generated TEXT,
page_count INTEGER DEFAULT 0,
error_message TEXT,
plan_json TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (name, ai_provider, ai_model, owner)
)
""")
Additional tables created in the same function:
- users
- project_access
- sessions
The projects table uses a 4-part primary key (name, ai_provider, ai_model, owner), which mirrors the on-disk variant path layout.
Project Filesystem Layout
Project artifacts are stored under <DATA_DIR>/projects/ and partitioned by owner, project name, provider, and model.
# src/docsfy/storage.py
def get_project_dir(
name: str, ai_provider: str = "", ai_model: str = "", owner: str = ""
) -> Path:
if not ai_provider or not ai_model:
msg = "ai_provider and ai_model are required for project directory paths"
raise ValueError(msg)
# Sanitize path segments to prevent traversal
for segment_name, segment in [("ai_provider", ai_provider), ("ai_model", ai_model)]:
if (
"/" in segment
or "\\" in segment
or ".." in segment
or segment.startswith(".")
):
msg = f"Invalid {segment_name}: '{segment}'"
raise ValueError(msg)
safe_owner = _validate_owner(owner)
return PROJECTS_DIR / safe_owner / _validate_name(name) / ai_provider / ai_model
def get_project_site_dir(
name: str, ai_provider: str = "", ai_model: str = "", owner: str = ""
) -> Path:
return get_project_dir(name, ai_provider, ai_model, owner) / "site"
def get_project_cache_dir(
name: str, ai_provider: str = "", ai_model: str = "", owner: str = ""
) -> Path:
return get_project_dir(name, ai_provider, ai_model, owner) / "cache" / "pages"
Expected tree for one variant:
<DATA_DIR>/
docsfy.db
projects/
<owner-or-_default>/
<project-name>/
<ai-provider>/
<ai-model>/
plan.json
cache/
pages/
<slug>.md
site/
.nojekyll
index.html
<slug>.html
<slug>.md
search-index.json
llms.txt
llms-full.txt
assets/
(copied files from src/docsfy/static/)
Owner fallback behavior is tested:
# tests/test_storage.py
path = get_project_dir("my-repo", "claude", "opus", "")
assert "_default" in str(path)
Project Cache Paths
Cache files are markdown pages stored at:
<DATA_DIR>/projects/<owner>/<project>/<provider>/<model>/cache/pages/<slug>.md
Write/read behavior:
# src/docsfy/generator.py
cache_file = cache_dir / f"{slug}.md"
if use_cache and cache_file.exists():
logger.debug(f"[{_label}] Using cached page: {slug}")
return cache_file.read_text(encoding="utf-8")
...
cache_dir.mkdir(parents=True, exist_ok=True)
cache_file.write_text(output, encoding="utf-8")
Invalidation behavior in generation flow:
# src/docsfy/main.py
if force:
cache_dir = get_project_cache_dir(project_name, ai_provider, ai_model, owner)
if cache_dir.exists():
shutil.rmtree(cache_dir)
logger.info(f"[{project_name}] Cleared cache (force=True)")
# src/docsfy/main.py
cache_file = cache_dir / f"{slug}.md"
...
if cache_file.exists():
cache_file.unlink()
- force=true removes the entire variant cache.
- Incremental regeneration removes only selected cached pages.
Generated Site Directories
Final rendered docs are written to each variant’s site/ directory, while plan.json is written in the variant root.
# src/docsfy/main.py
site_dir = get_project_site_dir(project_name, ai_provider, ai_model, owner)
render_site(plan=plan, pages=pages, output_dir=site_dir)
project_dir = get_project_dir(project_name, ai_provider, ai_model, owner)
(project_dir / "plan.json").write_text(json.dumps(plan, indent=2), encoding="utf-8")
render_site() fully rebuilds the output directory and writes the final artifact set:
# src/docsfy/renderer.py
def render_site(plan: dict[str, Any], pages: dict[str, str], output_dir: Path) -> None:
if output_dir.exists():
shutil.rmtree(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
assets_dir = output_dir / "assets"
assets_dir.mkdir(exist_ok=True)
# Prevent GitHub Pages from running Jekyll
(output_dir / ".nojekyll").touch()
...
(output_dir / "index.html").write_text(index_html, encoding="utf-8")
...
(output_dir / f"{slug}.html").write_text(page_html, encoding="utf-8")
(output_dir / f"{slug}.md").write_text(md_content, encoding="utf-8")
...
(output_dir / "search-index.json").write_text(
json.dumps(search_index), encoding="utf-8"
)
...
(output_dir / "llms.txt").write_text(llms_txt, encoding="utf-8")
(output_dir / "llms-full.txt").write_text(llms_full_txt, encoding="utf-8")
Warning:
render_site() deletes the previous site/ directory before writing new output. Treat site/ as generated output only.
Container and Runtime Path Mapping
Containerized runs are explicitly wired to /data for persistence:
# docker-compose.yaml
services:
docsfy:
build: .
ports:
- "8000:8000"
env_file: .env
volumes:
- ./data:/data
# Dockerfile
RUN useradd --create-home --shell /bin/bash -g 0 appuser \
&& mkdir -p /data \
&& chown appuser:0 /data \
&& chmod -R g+w /data
Generated data is intentionally not tracked in git:
# .gitignore
# Data
data/
.dev/data/
Tip: In Docker deployments, back up the host-side
./data directory to preserve both docsfy.db and generated docs artifacts.
Ephemeral (Non-persistent) Paths
Not all file activity is under DATA_DIR:
- Remote repo cloning uses a temporary directory.
- Download archives are created as temporary .tar.gz files and removed after streaming.
# src/docsfy/main.py
with tempfile.TemporaryDirectory() as tmp_dir:
repo_dir, commit_sha = await asyncio.to_thread(
clone_repo, repo_url, Path(tmp_dir)
)
# src/docsfy/main.py
tmp = tempfile.NamedTemporaryFile(suffix=".tar.gz", delete=False)
tar_path = Path(tmp.name)
tmp.close()
...
finally:
tar_path.unlink(missing_ok=True)
Note: This repository currently has no
.github/workflows/ or .gitlab-ci.yml; storage behavior is defined by runtime code and container configuration.