feat: generate llms.txt from template and annotate entries with star counts

- Add llms.txt template with a Jinja2-style `{{ categories_md }}` placeholder (filled by plain string replacement)
- Extract the Categories body from the README and inject it into the template
- Annotate bullet-entry lines with GitHub star counts: "(N GitHub stars)" in the main index.md, a bare "(N)" in llms.txt
- Add TestAnnotateEntriesWithStars unit tests

Co-Authored-By: Claude <noreply@anthropic.com>
2026-05-07 08:20:21 -05:00 · 2026-05-02 02:32:18 +08:00
parent d9f26a8635
commit 429c9b3d12
3 changed files with 169 additions and 3 deletions
--- a/website/build.py
+++ b/website/build.py
@@ -14,6 +14,8 @@ from jinja2 import Environment, FileSystemLoader
 from readme_parser import ParsedGroup, ParsedSection, parse_readme, parse_sponsors

 GITHUB_REPO_URL_RE = re.compile(r"^https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$")
+MARKDOWN_LINK_RE = re.compile(r"\[[^\]]+\]\(([^)\s]+)\)")  # inline link `[text](url)`; group 1 is the url
+BULLET_LINE_RE = re.compile(r"^\s*-\s")  # optional indent, then "-", then one whitespace character
 SITE_URL = "https://awesome-python.com/"
 SITEMAP_URL = f"{SITE_URL}sitemap.xml"
 SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"
@@ -104,6 +106,72 @@ def top_level_heading_text(line: str) -> str | None:
    return stripped.removeprefix("#").strip().strip("#").strip().strip("*").strip()


+LLMS_CATEGORIES_PLACEHOLDER = "{{ categories_md }}"  # literal marker swapped out via str.replace in build_llms_txt
+
+
+def extract_categories_body(markdown: str) -> str:
+    """Return the text between the `# Categories` heading and the next top-level heading."""
+    rows = markdown.splitlines(keepends=True)
+    first, stop = None, len(rows)
+    for pos, row in enumerate(rows):
+        title = top_level_heading_text(row)
+        if title is None:
+            continue
+        if first is not None:
+            # Body already open: the next top-level heading at or past it closes the section.
+            if pos >= first:
+                stop = pos
+                break
+        elif title.lower() == "categories":
+            first = pos + 1  # body starts below the heading, after any blank lines
+            while first < len(rows) and not rows[first].strip():
+                first += 1
+    # No Categories heading -> ""; otherwise trailing whitespace collapses to one newline.
+    return "" if first is None else "".join(rows[first:stop]).rstrip() + "\n"
+
+
+def build_llms_txt(template_text: str, readme_text: str, stars_data: dict[str, dict]) -> str:
+    """Fill the llms.txt template with the README's Categories body, then add bare star counts."""
+    categories_md = extract_categories_body(readme_text).rstrip()
+    filled = template_text.replace(LLMS_CATEGORIES_PLACEHOLDER, categories_md)
+    return annotate_entries_with_stars(filled, stars_data, format_stars=str)
+
+
+def annotate_entries_with_stars(
+    markdown: str,
+    stars_data: dict[str, dict],
+    *,
+    format_stars=None,
+) -> str:
+    """Append a parenthesized star count to bullet lines that link to a known GitHub repo.
+
+    The first markdown link on a bullet line whose repo has a "stars" value in `stars_data`
+    supplies the count. Non-bullet lines and lines without such a link are left unchanged.
+    The suffix goes before the line terminator, whichever form `str.splitlines` recognizes,
+    so CRLF input does not end up with a carriage return in the middle of the line.
+
+    `format_stars` maps the count to the parenthesized text. Defaults to "{N} GitHub stars".
+    Pass `str` for a bare number.
+    """
+    if format_stars is None:
+        format_stars = lambda n: f"{n} GitHub stars"  # noqa: E731 lambda-assignment
+    out: list[str] = []
+    # Pair each line's content with its raw form so the exact terminator can be re-attached.
+    for text, line in zip(markdown.splitlines(), markdown.splitlines(keepends=True)):
+        annotated = line
+        if BULLET_LINE_RE.match(line):
+            for match in MARKDOWN_LINK_RE.finditer(line):
+                # Links that are not GitHub repos, or have no star data, are skipped.
+                repo_key = extract_github_repo(match.group(1))
+                entry = stars_data.get(repo_key) if repo_key else None
+                if entry and "stars" in entry:
+                    ending = line[len(text):]
+                    annotated = f"{text} ({format_stars(entry['stars'])}){ending}"
+                    break  # only the first link with star data is used
+        out.append(annotated)
+    return "".join(out)
+
+
 def remove_sponsors_section(markdown: str) -> str:
    lines = markdown.splitlines(keepends=True)
    start_idx = None
@@ -243,11 +311,15 @@ def build(repo_root: Path) -> None:
    if static_src.exists():
        shutil.copytree(static_src, static_dst, dirs_exist_ok=True)

-    markdown_index = remove_sponsors_section(readme_text)
+    markdown_index = annotate_entries_with_stars(
+        remove_sponsors_section(readme_text), stars_data
+    )
+    llms_template = (website / "templates" / "llms.txt").read_text(encoding="utf-8")
+    llms_txt = build_llms_txt(llms_template, readme_text, stars_data)
    (site_dir / "robots.txt").write_text(build_robots_txt(), encoding="utf-8")
    write_sitemap_xml(site_dir / "sitemap.xml", [(SITE_URL, build_date.date().isoformat())])
    (site_dir / "index.md").write_text(markdown_index, encoding="utf-8")
-    (site_dir / "llms.txt").write_text(markdown_index, encoding="utf-8")
+    (site_dir / "llms.txt").write_text(llms_txt, encoding="utf-8")

    print(f"Built single page with {len(parsed_groups)} groups, {len(categories)} categories")
    print(f"Total entries: {total_entries}")
--- a/website/templates/llms.txt
+++ b/website/templates/llms.txt
@@ -0,0 +1,9 @@
+# Awesome Python
+
+An opinionated guide to the best Python frameworks, libraries, tools, and resources.
+
+Use this curated list when you need to find a high-quality Python library or tool for tasks such as web development, data science, machine learning, AI agents, automation, testing, or DevOps. Where star data is available, the trailing number in parentheses on an entry is its GitHub star count.
+
+# Categories
+
+{{ categories_md }}
--- a/website/tests/test_build.py
+++ b/website/tests/test_build.py
@@ -9,6 +9,7 @@ from html.parser import HTMLParser
 from pathlib import Path

 from build import (
+    annotate_entries_with_stars,
    build,
    detect_source_type,
    extract_entries,
@@ -108,6 +109,16 @@ class TestBuild:
            "{% endblock %}",
            encoding="utf-8",
        )
+        (tpl_dir / "llms.txt").write_text(
+            "# Awesome Python\n"
+            "\n"
+            "Use this list to find Python tools.\n"
+            "\n"
+            "# Categories\n"
+            "\n"
+            "{{ categories_md }}\n",
+            encoding="utf-8",
+        )

    def _copy_real_templates(self, tmp_path):
        real_tpl = Path(__file__).parent / ".." / "templates"
@@ -223,6 +234,7 @@ class TestBuild:
            ## Widgets

            - [w1](https://example.com) - A widget.
+            - [w2](https://github.com/owner/w2) - A starred widget.

            # Contributing

@@ -231,6 +243,13 @@ class TestBuild:
        (tmp_path / "README.md").write_text(readme, encoding="utf-8")
        self._copy_real_templates(tmp_path)

+        data_dir = tmp_path / "website" / "data"
+        data_dir.mkdir(parents=True)
+        stars = {
+            "owner/w2": {"stars": 42, "owner": "owner", "fetched_at": "2026-01-01T00:00:00+00:00"},
+        }
+        (data_dir / "github_stars.json").write_text(json.dumps(stars), encoding="utf-8")
+
        build(tmp_path)

        site = tmp_path / "website" / "output"
@@ -239,13 +258,23 @@ class TestBuild:
        llms_txt = (site / "llms.txt").read_text(encoding="utf-8")

        assert '<link rel="alternate" type="text/markdown" href="/index.md" />' in index_html
-        assert index_md == llms_txt
        assert index_md.startswith("# Awesome Python\n\nIntro.\n\n# Categories")
        assert "# **Sponsors**" not in index_md
        assert "Sponsor" not in index_md
        assert "SPONSORSHIP.md" not in index_md
        assert "## Widgets" in index_md
        assert "- [w1](https://example.com) - A widget." in index_md
+        assert "- [w2](https://github.com/owner/w2) - A starred widget. (42 GitHub stars)" in index_md
+
+        assert llms_txt.startswith("# Awesome Python\n")
+        assert "# Categories" in llms_txt
+        assert "Use this curated list" in llms_txt
+        assert "## Widgets" in llms_txt
+        assert "- [w1](https://example.com) - A widget." in llms_txt
+        assert "- [w2](https://github.com/owner/w2) - A starred widget. (42)" in llms_txt
+        assert "{{ categories_md }}" not in llms_txt
+        assert "# Contributing" not in llms_txt
+        assert "Help!" not in llms_txt

    def test_build_cleans_stale_output(self, tmp_path):
        readme = textwrap.dedent("""\
@@ -604,3 +633,59 @@ class TestExtractEntries:
        categories = [c for g in groups for c in g["categories"]]
        entries = extract_entries(categories, groups)
        assert entries[0]["source_type"] == "Built-in"
+
+
+# ---------------------------------------------------------------------------
+# annotate_entries_with_stars
+# ---------------------------------------------------------------------------
+
+
+class TestAnnotateEntriesWithStars:  # unit tests for build.annotate_entries_with_stars
+    def test_appends_star_count_to_bullet(self):  # default format: " (N GitHub stars)" before the newline
+        markdown = "- [foo](https://github.com/owner/foo) - A foo.\n"
+        stars = {"owner/foo": {"stars": 123, "owner": "owner"}}
+        assert annotate_entries_with_stars(markdown, stars) == (
+            "- [foo](https://github.com/owner/foo) - A foo. (123 GitHub stars)\n"
+        )
+
+    def test_uses_first_github_link(self):  # count comes from the first link; suffix still goes at line end
+        markdown = (
+            "- [foo](https://github.com/owner/foo) - A foo. "
+            "Also [bar](https://github.com/owner/bar).\n"
+        )
+        stars = {
+            "owner/foo": {"stars": 10, "owner": "owner"},
+            "owner/bar": {"stars": 99, "owner": "owner"},
+        }
+        assert annotate_entries_with_stars(markdown, stars) == (
+            "- [foo](https://github.com/owner/foo) - A foo. "
+            "Also [bar](https://github.com/owner/bar). (10 GitHub stars)\n"
+        )
+
+    def test_skips_entries_without_star_data(self):  # empty stars_data leaves the text unchanged
+        markdown = "- [foo](https://github.com/owner/foo) - A foo.\n"
+        assert annotate_entries_with_stars(markdown, {}) == markdown
+
+    def test_skips_non_github_links(self):  # an example.com link yields no annotation
+        markdown = "- [foo](https://example.com) - A foo.\n"
+        stars = {"owner/foo": {"stars": 1, "owner": "owner"}}
+        assert annotate_entries_with_stars(markdown, stars) == markdown
+
+    def test_skips_non_bullet_lines(self):  # a prose line without a leading "-" is left alone
+        markdown = "See [foo](https://github.com/owner/foo) for details.\n"
+        stars = {"owner/foo": {"stars": 1, "owner": "owner"}}
+        assert annotate_entries_with_stars(markdown, stars) == markdown
+
+    def test_handles_indented_bullets(self):  # leading spaces before "-" still count as a bullet
+        markdown = "    - [foo](https://github.com/owner/foo)\n"
+        stars = {"owner/foo": {"stars": 7, "owner": "owner"}}
+        assert annotate_entries_with_stars(markdown, stars) == (
+            "    - [foo](https://github.com/owner/foo) (7 GitHub stars)\n"
+        )
+
+    def test_preserves_lines_without_trailing_newline(self):  # no newline is added when the input has none
+        markdown = "- [foo](https://github.com/owner/foo) - A foo."
+        stars = {"owner/foo": {"stars": 5, "owner": "owner"}}
+        assert annotate_entries_with_stars(markdown, stars) == (
+            "- [foo](https://github.com/owner/foo) - A foo. (5 GitHub stars)"
+        )