# CorpusIQ robots.txt
# Canonical host: www.corpusiq.io. The bare apex (corpusiq.io) and the
# legacy .app variants are 308-redirected to www, both at the platform
# layer (Vercel Domains) and at the Next.js layer via the host-rule
# redirects at the top of next.config.ts. Per Decision #10 (2026-05-04),
# only one Sitemap line is allowed and it MUST point at the www host.
# Last reviewed: 2026-05-04
#
# Every block below documents why the bot is allowed or blocked.
# See /ai.txt for the spawning.ai style allowlist for AI training.
# See /llms.txt for the structured LLM site summary.

# --- Search engines ---
# The three search crawlers share one group. A crawler obeys only the most
# specific group that names it and ignores "User-agent: *" (RFC 9309), so
# the private-path rules from the default group are repeated at the end of
# this group; without them these bots could crawl /api/, /admin/, and the
# other non-public paths. Keep the group free of blank lines: some parsers
# treat a blank line as the end of a group.
#
# Googlebot. Primary organic search. Allow everything public.
User-agent: Googlebot
# Bingbot. Primary for Bing, DuckDuckGo, and Microsoft Copilot. Allow.
User-agent: Bingbot
# DuckDuckBot. DuckDuckGo's own crawler. Allow for traditional search.
User-agent: DuckDuckBot
Allow: /
Disallow: /_next/data/
Disallow: /api/
Disallow: /admin/
Disallow: /dashboard/
Disallow: /login/
Disallow: /register/
Disallow: /oauth/

# --- AI assistants and retrieval agents ---
# All agents below share one group. A crawler obeys only the most specific
# group that names it and ignores "User-agent: *" (RFC 9309), so the
# private-path rules from the default group are repeated at the end of this
# group; without them these agents could crawl /api/, /admin/, and the other
# non-public paths. Keep the group free of blank lines: some parsers treat
# a blank line as the end of a group.
#
# GPTBot. OpenAI training crawler. Allow so CorpusIQ public pages can inform
# OpenAI models; search-mode citations are handled by OAI-SearchBot below.
User-agent: GPTBot
# ChatGPT-User. Interactive ChatGPT browsing on a user's behalf. Allow.
User-agent: ChatGPT-User
# OAI-SearchBot. OpenAI's SearchGPT indexer. Allow for search-mode citations.
User-agent: OAI-SearchBot
# ClaudeBot. Anthropic's crawler for Claude. Allow for citation in Claude chats.
User-agent: ClaudeBot
# Claude-User. Anthropic's current user-initiated browsing agent. Allow.
# Per CLAUDE.md "AI Crawler Posture" allowlist.
User-agent: Claude-User
# Claude-Web. Anthropic's older web-browsing user agent. Kept for back-compat.
User-agent: Claude-Web
# anthropic-ai. Older Anthropic user agent string. Allow for historical continuity.
User-agent: anthropic-ai
# PerplexityBot. Perplexity's index crawler. Allow for Perplexity answer citations.
User-agent: PerplexityBot
# Perplexity-User. Interactive fetch by Perplexity on behalf of a user. Allow.
User-agent: Perplexity-User
# Google-Extended. Control token (not a separate crawler) that governs whether
# content fetched by Google's crawlers may be used for Google AI training and
# grounding. Allow.
User-agent: Google-Extended
# CCBot. Common Crawl. Foundation corpus for many AI systems. Allow.
User-agent: CCBot
# Applebot. Apple's crawler for Siri and Spotlight. Allow.
User-agent: Applebot
# Applebot-Extended. Apple's opt-in for Apple Intelligence training. Allow.
User-agent: Applebot-Extended
# Amazonbot. Amazon's crawler behind Alexa and product search. Allow.
User-agent: Amazonbot
# Meta-ExternalAgent. Meta AI training agent. Allow.
User-agent: Meta-ExternalAgent
# FacebookBot. Meta crawler. Allow.
# NOTE(review): Meta's crawler documentation describes FacebookBot as
# collecting text for speech-recognition language models, not link previews;
# previews are fetched by facebookexternalhit (next entry). Confirm.
User-agent: FacebookBot
# facebookexternalhit. Meta link preview crawler. Allow for social unfurl quality.
User-agent: facebookexternalhit
# DuckAssistBot. DuckDuckGo Assist AI. Allow.
User-agent: DuckAssistBot
# Cohere-ai. Cohere Compass and related services. Allow.
User-agent: Cohere-ai
# YouBot. You.com. Allow.
User-agent: YouBot
Allow: /
Disallow: /_next/data/
Disallow: /api/
Disallow: /admin/
Disallow: /dashboard/
Disallow: /login/
Disallow: /register/
Disallow: /oauth/

# --- Explicitly blocked crawlers ---
# Bytespider (ByteDance). Known for aggressive crawl rates and weak downstream
# traffic quality. Blocking to preserve crawl budget for bots that produce
# real citations. Review annually.
# "Disallow: /" asks this agent to stay off every path on the site. The rule
# is advisory: it only works if the crawler chooses to honor robots.txt.
User-agent: Bytespider
Disallow: /

# --- Default rules for all other agents ---
# This group applies only to crawlers that match no named group above. A
# crawler that matches a named group obeys that group alone and ignores
# this one, so the Disallow rules here do not reach it.
# /_next/static/* is intentionally NOT blocked. These are the compiled JS
# and CSS bundles every bot needs to render the site correctly. Blocking
# them breaks rendering for any crawler not on the allowlist above.
# /_next/data/ is blocked because it exposes server-side data fetches that
# are internal to Next.js route prefetching.
# NOTE(review): every Disallow below ends in "/", so it matches URLs under
# that directory (e.g. /login/reset) but not the bare path (e.g. /login).
# Confirm whether the bare paths also need to be blocked.
User-agent: *
Allow: /
Allow: /_next/static/
Disallow: /_next/data/
Disallow: /api/
Disallow: /admin/
Disallow: /dashboard/
Disallow: /login/
Disallow: /register/
Disallow: /oauth/

Sitemap: https://www.corpusiq.io/sitemap.xml