#!/usr/bin/env bash
# seo-postbuild.sh
# Post-traitement SEO après `mdbook build` / build-all-langs.sh.
#
# - Génère un sitemap.xml global et par langue
# - Copie robots.txt et llms.txt depuis seo/ vers book/
#
# Usage : ./scripts/seo-postbuild.sh [BASE_URL]
# Par défaut BASE_URL = https://demo.gitrust.eu/docs

# Strict mode: abort on command failure, on unset variables, and on a failure
# in any stage of a pipeline.
set -euo pipefail

# Run from the parent of the directory that contains this script, so the
# relative paths used below (seo/, book/) resolve no matter where the script
# is invoked from. `--` protects against a path that starts with a dash.
ROOT="$(cd -- "$(dirname -- "$0")/.." && pwd)"
cd -- "$ROOT"

# Public base URL of the published docs; the first positional argument
# overrides the default.
BASE_URL="${1:-https://demo.gitrust.eu/docs}"
# Every URL below is built as "$BASE_URL/<lang>/<page>", so a trailing slash
# on the argument would yield "//" in each generated URL. Strip it here.
while [[ "$BASE_URL" == */ ]]; do
  BASE_URL="${BASE_URL%/}"
done

# Language codes; each one is looked up as a sub-directory of BOOK_DIR.
LANGS=(fr en de es pt it)
# Build output directory, relative to ROOT.
BOOK_DIR="book"
readonly ROOT BASE_URL LANGS BOOK_DIR

# Nothing to post-process if the build output is missing.
if [[ ! -d "$BOOK_DIR" ]]; then
  echo "$BOOK_DIR/ introuvable — lance d'abord build-all-langs.sh" >&2
  exit 1
fi

# ---------------------------------------------------------------------------
# robots.txt + llms.txt at the site root, with the default origin URL
# rewritten to BASE_URL.
# ---------------------------------------------------------------------------
# BASE_URL is spliced into a sed replacement that uses `|` as its delimiter.
# Backslash-escape the characters sed treats specially there: `\`, `&` (which
# stands for the whole match) and the delimiter itself.
base_url_sed="$(printf '%s' "$BASE_URL" | sed -e 's/[\\&|]/\\&/g')"

for seo_file in robots.txt llms.txt; do
  # Check before redirecting: the `>` below would otherwise create an empty
  # output file before sed gets a chance to fail on the missing input.
  if [[ ! -f "seo/$seo_file" ]]; then
    echo "seo/$seo_file introuvable" >&2
    exit 1
  fi
  # Dots are escaped so the pattern matches the literal host name only.
  sed -e "s|https://demo\.gitrust\.eu/docs|$base_url_sed|g" \
    "seo/$seo_file" > "$BOOK_DIR/$seo_file"
done
echo "✓ robots.txt et llms.txt copiés vers $BOOK_DIR/"

# ---------------------------------------------------------------------------
# Fix the URLs injected by theme/head.hbs: mdBook's {{ path }} variable yields
# a path ending in .md, so rewrite it to .html inside the SEO tags (canonical,
# hreflang, og:url, JSON-LD url). The default base URL is then replaced by
# BASE_URL.
# ---------------------------------------------------------------------------
echo "→ Normalisation des URLs SEO dans les .html (.md → .html, base URL)"

# BASE_URL ends up in a sed replacement delimited by `|`: escape `\`, `&`
# (whole match) and `|` so they are inserted literally.
base_url_sed="$(printf '%s' "$BASE_URL" | sed -e 's/[\\&|]/\\&/g')"

# `-exec … {} +` batches the files like xargs does, but does not run sed at
# all when nothing matches (a bare `sed -i` without file operands fails and
# would abort the script). `-type f` keeps sed away from directories.
# Expression order matters: the .md → .html rule matches on the default host
# name, so it must run before that host name is replaced.
# NOTE: `sed -i` without a backup suffix is GNU sed syntax.
find "$BOOK_DIR" -type f -name '*.html' -exec sed -i \
  -e 's|\(demo\.gitrust\.eu/docs/[a-z/_0-9-]*\)\.md|\1.html|g' \
  -e "s|https://demo\.gitrust\.eu/docs|$base_url_sed|g" \
  {} +

# ---------------------------------------------------------------------------
# Per-language sitemap: writes $BOOK_DIR/<lang>/sitemap.xml for every language
# directory that exists, listing each HTML page with its hreflang alternates.
# ---------------------------------------------------------------------------
for lang in "${LANGS[@]}"; do
  lang_dir="$BOOK_DIR/$lang"
  # Languages that were not built are skipped silently.
  [[ -d "$lang_dir" ]] || continue

  sitemap="$lang_dir/sitemap.xml"
  # Generation time (UTC), used as <lastmod> for every page of this language.
  now="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  # Counted inside the loop rather than with `grep -c` afterwards: grep exits
  # with status 1 when the count is 0, which would abort the script under
  # `set -e` for a language directory without any indexable page.
  pages=0

  {
    echo '<?xml version="1.0" encoding="UTF-8"?>'
    echo '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"'
    echo '        xmlns:xhtml="http://www.w3.org/1999/xhtml">'

    # One <url> entry per HTML page of this language, except mdBook's
    # print.html and 404.html.
    while IFS= read -r page; do
      # Path relative to the language directory. The prefix is quoted so it
      # is removed as a literal string, not interpreted as a glob pattern.
      rel="${page#"$lang_dir"/}"
      url="$BASE_URL/$lang/$rel"
      printf '%s\n' "  <url>"
      printf '%s\n' "    <loc>$url</loc>"
      printf '%s\n' "    <lastmod>$now</lastmod>"

      # hreflang alternates: one per language in LANGS (the page's own
      # language included), limited to languages where the same relative page
      # was actually built, so no alternate points at a missing file.
      for alt in "${LANGS[@]}"; do
        [[ -f "$BOOK_DIR/$alt/$rel" ]] || continue
        alt_url="$BASE_URL/$alt/$rel"
        printf '%s\n' "    <xhtml:link rel=\"alternate\" hreflang=\"$alt\" href=\"$alt_url\"/>"
      done
      # x-default points at the French version, when that page exists.
      if [[ -f "$BOOK_DIR/fr/$rel" ]]; then
        printf '%s\n' "    <xhtml:link rel=\"alternate\" hreflang=\"x-default\" href=\"$BASE_URL/fr/$rel\"/>"
      fi

      # Heuristic priority: language root index > tutorials > how-to >
      # reference > explanation / anything else.
      case "$rel" in
        index.html)     prio="1.0" ;;
        *tutorials/*)   prio="0.8" ;;
        *how-to/*)      prio="0.7" ;;
        *reference/*)   prio="0.6" ;;
        *explanation/*) prio="0.5" ;;
        *)              prio="0.5" ;;
      esac
      printf '%s\n' "    <priority>$prio</priority>"
      printf '%s\n' "  </url>"
      pages=$((pages + 1))
    done < <(find "$lang_dir" -type f -name "*.html" ! -name "print.html" ! -name "404.html" | sort)

    echo '</urlset>'
  } > "$sitemap"

  echo "$sitemap ($pages pages)"
done

# ---------------------------------------------------------------------------
# Root sitemap index referencing every per-language sitemap that exists,
# followed by a summary of the produced artefacts.
# ---------------------------------------------------------------------------
# Number of per-language sitemaps actually referenced. Languages without a
# sitemap.xml are left out of the index, so this can be lower than the number
# of entries in LANGS.
index_count=0
{
  echo '<?xml version="1.0" encoding="UTF-8"?>'
  echo '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
  # Generation time (UTC), used as <lastmod> for every referenced sitemap.
  now="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  for lang in "${LANGS[@]}"; do
    if [[ -f "$BOOK_DIR/$lang/sitemap.xml" ]]; then
      echo "  <sitemap>"
      echo "    <loc>$BASE_URL/$lang/sitemap.xml</loc>"
      echo "    <lastmod>$now</lastmod>"
      echo "  </sitemap>"
      index_count=$((index_count + 1))
    fi
  done
  echo '</sitemapindex>'
} > "$BOOK_DIR/sitemap.xml"

# Report the number of sitemaps really listed in the index, not the number of
# configured languages.
echo "$BOOK_DIR/sitemap.xml (index de $index_count sitemaps par langue)"
echo ""
echo "Artefacts SEO produits :"
echo "  - $BOOK_DIR/robots.txt"
echo "  - $BOOK_DIR/llms.txt"
echo "  - $BOOK_DIR/sitemap.xml (index)"
echo "  - $BOOK_DIR/<lang>/sitemap.xml × $index_count"