feat: generate llms.txt from template and annotate entries with star counts

- Add llms.txt Jinja2 template with a categories_md placeholder
- Extract categories body from README and inject it into the template
- Annotate bullet-entry lines with GitHub star counts: "(N GitHub stars)"
  in the main index.md and a bare "(N)" in llms.txt
- Add TestAnnotateEntriesWithStars unit tests

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Vinta Chen
2026-05-02 02:32:18 +08:00
parent d9f26a8635
commit 429c9b3d12
3 changed files with 169 additions and 3 deletions

View File

@@ -14,6 +14,8 @@ from jinja2 import Environment, FileSystemLoader
from readme_parser import ParsedGroup, ParsedSection, parse_readme, parse_sponsors
# Matches a whole GitHub repository URL (http or https, optional ".git" suffix,
# optional trailing slash) and captures the "owner/repo" path in group 1.
GITHUB_REPO_URL_RE = re.compile(r"^https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$")
# Matches an inline Markdown link "[text](url)" and captures the URL in group 1.
# The URL may not contain whitespace or ")", so links with a title do not match.
MARKDOWN_LINK_RE = re.compile(r"\[[^\]]+\]\(([^)\s]+)\)")
# Matches a line whose first non-whitespace characters are a dash followed by
# whitespace, i.e. a (possibly indented) Markdown list bullet.
BULLET_LINE_RE = re.compile(r"^\s*-\s")
# Site root; it ends with "/" so that file names can be appended directly.
SITE_URL = "https://awesome-python.com/"
SITEMAP_URL = f"{SITE_URL}sitemap.xml"
# Namespace URI of the sitemaps.org 0.9 schema.
SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"
@@ -104,6 +106,72 @@ def top_level_heading_text(line: str) -> str | None:
return stripped.removeprefix("#").strip().strip("#").strip().strip("*").strip()
# Literal marker in the llms.txt template; build_llms_txt swaps it for the
# README's Categories body with a plain string replacement.
LLMS_CATEGORIES_PLACEHOLDER = "{{ categories_md }}"
def extract_categories_body(markdown: str) -> str:
    """Return the text that follows the top-level `Categories` heading.

    The heading line itself and any blank lines directly after it are dropped.
    The body runs up to the next top-level heading (or the end of the input)
    and is returned with its trailing whitespace replaced by a single newline.
    The heading text is compared case-insensitively. An empty string is
    returned when no such heading is found.
    """
    all_lines = markdown.splitlines(keepends=True)
    total = len(all_lines)
    body_start = None
    body_end = total

    for index, text in enumerate(all_lines):
        title = top_level_heading_text(text)
        if title is None:
            continue

        if body_start is not None:
            # Already inside the Categories section: the next top-level
            # heading at or after the body start closes it.
            if index >= body_start:
                body_end = index
                break
            continue

        if title.lower() == "categories":
            body_start = index + 1
            # Skip the blank lines separating the heading from its content.
            while body_start < total and not all_lines[body_start].strip():
                body_start += 1

    if body_start is None:
        return ""

    section = "".join(all_lines[body_start:body_end])
    return section.rstrip() + "\n"
def build_llms_txt(template_text: str, readme_text: str, stars_data: dict[str, dict]) -> str:
    """Produce the llms.txt content from its template and the README.

    The README's Categories body (without trailing whitespace) replaces the
    placeholder in the template, and the filled-in document is then annotated
    with star counts formatted as bare numbers.
    """
    categories_md = extract_categories_body(readme_text).rstrip()
    filled_template = template_text.replace(LLMS_CATEGORIES_PLACEHOLDER, categories_md)
    return annotate_entries_with_stars(filled_template, stars_data, format_stars=str)
def _format_github_stars(count) -> str:
    """Return the default annotation text for a star count."""
    return f"{count} GitHub stars"


def annotate_entries_with_stars(
    markdown: str,
    stars_data: dict[str, dict],
    *,
    format_stars=None,
) -> str:
    """Append a star count to bullet entry lines that link to a known GitHub repo.

    For each bullet line, the Markdown links are scanned left to right and the
    first one that resolves to a GitHub repo with a "stars" value in
    `stars_data` decides the annotation. Lines that are not bullets, and
    bullets without such a link, are returned unchanged.

    Args:
        markdown: Markdown text to annotate.
        stars_data: Star records keyed by the repo key that
            `extract_github_repo` returns for a link URL (e.g. "owner/repo").
            Only the "stars" item of each record is used.
        format_stars: Callable that turns the star count into the
            parenthesized text. Defaults to "{N} GitHub stars".
            Pass `str` for a bare number.

    Returns:
        The text with " (<formatted stars>)" inserted at the end of every
        annotated line, before its line terminator. Terminators are preserved.
    """
    if format_stars is None:
        format_stars = _format_github_stars

    out: list[str] = []
    for line in markdown.splitlines(keepends=True):
        if not BULLET_LINE_RE.match(line):
            out.append(line)
            continue

        # Separate the content from its terminator. Stripping both "\r" and
        # "\n" keeps the annotation in front of a CRLF ending instead of
        # wedging it between the "\r" and the "\n".
        content = line.rstrip("\r\n")
        ending = line[len(content):]

        annotated = line
        for match in MARKDOWN_LINK_RE.finditer(content):
            repo_key = extract_github_repo(match.group(1))
            if not repo_key:
                continue
            entry = stars_data.get(repo_key)
            if not entry or "stars" not in entry:
                continue
            annotated = f"{content} ({format_stars(entry['stars'])}){ending}"
            # Only one annotation per line.
            break
        out.append(annotated)
    return "".join(out)
def remove_sponsors_section(markdown: str) -> str:
lines = markdown.splitlines(keepends=True)
start_idx = None
@@ -243,11 +311,15 @@ def build(repo_root: Path) -> None:
if static_src.exists():
shutil.copytree(static_src, static_dst, dirs_exist_ok=True)
markdown_index = remove_sponsors_section(readme_text)
markdown_index = annotate_entries_with_stars(
remove_sponsors_section(readme_text), stars_data
)
llms_template = (website / "templates" / "llms.txt").read_text(encoding="utf-8")
llms_txt = build_llms_txt(llms_template, readme_text, stars_data)
(site_dir / "robots.txt").write_text(build_robots_txt(), encoding="utf-8")
write_sitemap_xml(site_dir / "sitemap.xml", [(SITE_URL, build_date.date().isoformat())])
(site_dir / "index.md").write_text(markdown_index, encoding="utf-8")
(site_dir / "llms.txt").write_text(markdown_index, encoding="utf-8")
(site_dir / "llms.txt").write_text(llms_txt, encoding="utf-8")
print(f"Built single page with {len(parsed_groups)} groups, {len(categories)} categories")
print(f"Total entries: {total_entries}")

View File

@@ -0,0 +1,9 @@
# Awesome Python
An opinionated guide to the best Python frameworks, libraries, tools, and resources.
Use this curated list when you need to find a high-quality Python library or tool for tasks such as web development, data science, machine learning, AI agents, automation, testing, or DevOps. The number in parentheses at the end of an entry, when present, is its star count on GitHub.
# Categories
{{ categories_md }}

View File

@@ -9,6 +9,7 @@ from html.parser import HTMLParser
from pathlib import Path
from build import (
annotate_entries_with_stars,
build,
detect_source_type,
extract_entries,
@@ -108,6 +109,16 @@ class TestBuild:
"{% endblock %}",
encoding="utf-8",
)
(tpl_dir / "llms.txt").write_text(
"# Awesome Python\n"
"\n"
"Use this list to find Python tools.\n"
"\n"
"# Categories\n"
"\n"
"{{ categories_md }}\n",
encoding="utf-8",
)
def _copy_real_templates(self, tmp_path):
real_tpl = Path(__file__).parent / ".." / "templates"
@@ -223,6 +234,7 @@ class TestBuild:
## Widgets
- [w1](https://example.com) - A widget.
- [w2](https://github.com/owner/w2) - A starred widget.
# Contributing
@@ -231,6 +243,13 @@ class TestBuild:
(tmp_path / "README.md").write_text(readme, encoding="utf-8")
self._copy_real_templates(tmp_path)
data_dir = tmp_path / "website" / "data"
data_dir.mkdir(parents=True)
stars = {
"owner/w2": {"stars": 42, "owner": "owner", "fetched_at": "2026-01-01T00:00:00+00:00"},
}
(data_dir / "github_stars.json").write_text(json.dumps(stars), encoding="utf-8")
build(tmp_path)
site = tmp_path / "website" / "output"
@@ -239,13 +258,23 @@ class TestBuild:
llms_txt = (site / "llms.txt").read_text(encoding="utf-8")
assert '<link rel="alternate" type="text/markdown" href="/index.md" />' in index_html
assert index_md == llms_txt
assert index_md.startswith("# Awesome Python\n\nIntro.\n\n# Categories")
assert "# **Sponsors**" not in index_md
assert "Sponsor" not in index_md
assert "SPONSORSHIP.md" not in index_md
assert "## Widgets" in index_md
assert "- [w1](https://example.com) - A widget." in index_md
assert "- [w2](https://github.com/owner/w2) - A starred widget. (42 GitHub stars)" in index_md
assert llms_txt.startswith("# Awesome Python\n")
assert "# Categories" in llms_txt
assert "Use this curated list" in llms_txt
assert "## Widgets" in llms_txt
assert "- [w1](https://example.com) - A widget." in llms_txt
assert "- [w2](https://github.com/owner/w2) - A starred widget. (42)" in llms_txt
assert "{{ categories_md }}" not in llms_txt
assert "# Contributing" not in llms_txt
assert "Help!" not in llms_txt
def test_build_cleans_stale_output(self, tmp_path):
readme = textwrap.dedent("""\
@@ -604,3 +633,59 @@ class TestExtractEntries:
categories = [c for g in groups for c in g["categories"]]
entries = extract_entries(categories, groups)
assert entries[0]["source_type"] == "Built-in"
# ---------------------------------------------------------------------------
# annotate_entries_with_stars
# ---------------------------------------------------------------------------
class TestAnnotateEntriesWithStars:
    """Unit tests for `annotate_entries_with_stars` with its default star text."""

    def test_appends_star_count_to_bullet(self):
        """A bullet linking to a repo with star data gets the count appended."""
        star_lookup = {"owner/foo": {"stars": 123, "owner": "owner"}}
        source = "- [foo](https://github.com/owner/foo) - A foo.\n"
        expected = "- [foo](https://github.com/owner/foo) - A foo. (123 GitHub stars)\n"

        assert annotate_entries_with_stars(source, star_lookup) == expected

    def test_uses_first_github_link(self):
        """With several starred links on one line, the leftmost one is used."""
        star_lookup = {
            "owner/foo": {"stars": 10, "owner": "owner"},
            "owner/bar": {"stars": 99, "owner": "owner"},
        }
        source = (
            "- [foo](https://github.com/owner/foo) - A foo. "
            "Also [bar](https://github.com/owner/bar).\n"
        )
        expected = (
            "- [foo](https://github.com/owner/foo) - A foo. "
            "Also [bar](https://github.com/owner/bar). (10 GitHub stars)\n"
        )

        assert annotate_entries_with_stars(source, star_lookup) == expected

    def test_skips_entries_without_star_data(self):
        """A GitHub link with no star record leaves the line untouched."""
        source = "- [foo](https://github.com/owner/foo) - A foo.\n"

        result = annotate_entries_with_stars(source, {})

        assert result == source

    def test_skips_non_github_links(self):
        """Links to other hosts are never annotated."""
        star_lookup = {"owner/foo": {"stars": 1, "owner": "owner"}}
        source = "- [foo](https://example.com) - A foo.\n"

        result = annotate_entries_with_stars(source, star_lookup)

        assert result == source

    def test_skips_non_bullet_lines(self):
        """Prose lines are left alone even when they link to a starred repo."""
        star_lookup = {"owner/foo": {"stars": 1, "owner": "owner"}}
        source = "See [foo](https://github.com/owner/foo) for details.\n"

        result = annotate_entries_with_stars(source, star_lookup)

        assert result == source

    def test_handles_indented_bullets(self):
        """Leading whitespace before the dash still counts as a bullet."""
        star_lookup = {"owner/foo": {"stars": 7, "owner": "owner"}}
        source = " - [foo](https://github.com/owner/foo)\n"
        expected = " - [foo](https://github.com/owner/foo) (7 GitHub stars)\n"

        assert annotate_entries_with_stars(source, star_lookup) == expected

    def test_preserves_lines_without_trailing_newline(self):
        """No newline is invented for a final line that lacks one."""
        star_lookup = {"owner/foo": {"stars": 5, "owner": "owner"}}
        source = "- [foo](https://github.com/owner/foo) - A foo."
        expected = "- [foo](https://github.com/owner/foo) - A foo. (5 GitHub stars)"

        assert annotate_entries_with_stars(source, star_lookup) == expected