# As a condition of accessing this website, you agree to abide by the following
# content signals:

# (a)  If a Content-Signal = yes, you may collect content for the corresponding
#      use.
# (b)  If a Content-Signal = no, you may not collect content for the
#      corresponding use.
# (c)  If the website operator does not include a Content-Signal for a
#      corresponding use, the website operator neither grants nor restricts
#      permission via Content-Signal with respect to the corresponding use.

# The content signals and their meanings are:

# search:   building a search index and providing search results (e.g., returning
#           hyperlinks and short excerpts from your website's contents). Search does not
#           include providing AI-generated search summaries.
# ai-input: inputting content into one or more AI models (e.g., retrieval
#           augmented generation, grounding, or other real-time taking of content for
#           generative AI search answers).
# ai-train: training or fine-tuning AI models.

# ANY RESTRICTIONS EXPRESSED VIA CONTENT SIGNALS ARE EXPRESS RESERVATIONS OF
# RIGHTS UNDER ARTICLE 4 OF THE EUROPEAN UNION DIRECTIVE 2019/790 ON COPYRIGHT
# AND RELATED RIGHTS IN THE DIGITAL SINGLE MARKET.

# BEGIN Cloudflare Managed content

# NOTE(review): everything between the "BEGIN/END Cloudflare Managed"
# markers is presumably generated by Cloudflare; confirm before editing
# this region by hand, since manual changes may be overwritten.
#
# Default group. Signals search=yes and ai-train=no; no ai-input signal
# is given, so per (c) above that use is neither granted nor restricted
# via Content-Signal. `Allow: /` leaves every path crawlable.
User-agent: *
Content-Signal: search=yes,ai-train=no
Allow: /

# Each group below disallows the whole site (`Disallow: /`) for the one
# crawler token it names.
User-agent: Amazonbot
Disallow: /

User-agent: Applebot-Extended
Disallow: /

User-agent: Bytespider
Disallow: /

User-agent: CCBot
Disallow: /

User-agent: ClaudeBot
Disallow: /

User-agent: CloudflareBrowserRenderingCrawler
Disallow: /

User-agent: Google-Extended
Disallow: /

User-agent: GPTBot
Disallow: /

User-agent: meta-externalagent
Disallow: /

# END Cloudflare Managed Content

# Site-wide default rules for any crawler without a named group.
#
# The rules of this group are kept contiguous (comments only, no blank
# lines): parsers that follow the original blank-line-separated record
# convention end a group at the first blank line and would silently drop
# every rule after it. Disallow rules come before the catch-all Allow so
# that first-match parsers honour them as well.
User-agent: *
# Private / non-content paths. `/dashboard` is a prefix match, so it also
# covers `/dashboard/` and everything beneath it.
Disallow: /dashboard
Disallow: /success
Disallow: /api/
# OG share-card images, referenced from each page's `og:image` meta tag.
# Excluding the directory keeps image search indexes focused on actual
# content imagery instead of share-card variants. Link-preview fetchers
# that need these images get their own groups below.
Disallow: /og/
# Pagefind static-search index assets. Allowing crawl is harmless (the
# .pf_fragment / .pf_index files are content slices of pages that are
# already indexable), but explicitly allowing /pagefind/ prevents future
# rule additions from accidentally blocking the client-side search.
Allow: /pagefind/
# Everything else is crawlable.
Allow: /

# Link-preview fetchers. These obey robots.txt when downloading the
# `og:image` for a shared link, so the `/og/` exclusion above would strip
# the card image from shared links. A named group replaces the wildcard
# group for that crawler, so the private-path exclusions are repeated.
User-agent: Twitterbot
Disallow: /dashboard
Disallow: /success
Disallow: /api/
Allow: /og/
Allow: /

User-agent: facebookexternalhit
Disallow: /dashboard
Disallow: /success
Disallow: /api/
Allow: /og/
Allow: /

User-agent: LinkedInBot
Disallow: /dashboard
Disallow: /success
Disallow: /api/
Allow: /og/
Allow: /

# Dedicated groups for the two most important search bots. A crawler
# that matches a named group uses that group instead of the
# `User-agent: *` rules, so the exclusions are spelled out in full here
# and every other path stays crawlable for Googlebot and Bingbot.
User-agent: Googlebot
Disallow: /api/
Disallow: /dashboard
Disallow: /og/
Disallow: /success
Allow: /

User-agent: Bingbot
Disallow: /api/
Disallow: /dashboard
Disallow: /og/
Disallow: /success
Allow: /

# ── AI crawlers ────────────────────────────────────────────────────────
#
# Posture: block crawlers that exist to train foundation models on our
# content; allow crawlers that fetch in real time to answer a specific
# user query (those drive discovery and traffic, not training).
#
# This list is updated as new agents appear. If you operate a crawler
# that respects robots.txt and is not listed, you can email
# mk@sentinelden.com to discuss bulk access for legitimate use cases.

# Training crawlers blocked. Each agent is listed by name rather than
# matched by a pattern, so the intent is auditable and future agents are
# not silently blocked. An agent that is not named anywhere in this file
# falls back to the `User-agent: *` rules.

# One group per training crawler, in alphabetical order (ignoring case).
# Every group disallows the entire site.

# Amazon: model training crawler.
User-agent: Amazonbot
Disallow: /

# Anthropic: legacy training crawler token.
User-agent: anthropic-ai
Disallow: /

# Apple: AI training crawler. Distinct from the regular Applebot that
# serves Siri / Spotlight search results.
User-agent: Applebot-Extended
Disallow: /

# ByteDance / Doubao: training crawler.
User-agent: Bytespider
Disallow: /

# Common Crawl: frequently used as an upstream source of training data.
User-agent: CCBot
Disallow: /

# Anthropic: current training crawler.
User-agent: ClaudeBot
Disallow: /

# Cohere: training crawler.
User-agent: cohere-ai
Disallow: /

# Meta / Llama: training crawler (see also meta-externalagent).
User-agent: FacebookBot
Disallow: /

# Google Gemini / Bard: training crawler. Blocking it does NOT affect
# Google Search ranking; that is governed by the Googlebot group.
User-agent: Google-Extended
Disallow: /

# OpenAI: training crawler. Distinct from ChatGPT-User, which fetches in
# real time and has its own group.
User-agent: GPTBot
Disallow: /

# Image-training crawler used by several model vendors.
User-agent: ImagesiftBot
Disallow: /

# Meta / Llama: training crawler (see also FacebookBot).
User-agent: meta-externalagent
Disallow: /

# Omgili / Webz.io: aggregator commonly resold as training data.
User-agent: Omgili
Disallow: /

User-agent: Omgilibot
Disallow: /

# Real-time answer crawlers ALLOWED below. These fetch on behalf of a
# specific user query and surface the source in the answer, which
# drives traffic back to the site. Each gets a named group so the
# wildcard group is not the source of authority for them.
#
# A crawler that matches a named group no longer reads the
# `User-agent: *` rules, so a bare `Allow: /` would open /dashboard,
# /success and /api/ to these agents. The private-path exclusions are
# therefore repeated in every group, listed before the catch-all Allow.
# NOTE(review): /og/ is not restricted for these agents; confirm that
# is intended.
User-agent: ChatGPT-User
Disallow: /dashboard
Disallow: /success
Disallow: /api/
Allow: /

User-agent: Claude-User
Disallow: /dashboard
Disallow: /success
Disallow: /api/
Allow: /

User-agent: PerplexityBot
Disallow: /dashboard
Disallow: /success
Disallow: /api/
Allow: /

User-agent: Perplexity-User
Disallow: /dashboard
Disallow: /success
Disallow: /api/
Allow: /

User-agent: OAI-SearchBot
Disallow: /dashboard
Disallow: /success
Disallow: /api/
Allow: /

Sitemap: https://sentinelden.com/sitemap-index.xml