From 429c9b3d12061d3d7fac3c5528fe35f3451a5dab Mon Sep 17 00:00:00 2001 From: Vinta Chen Date: Sat, 2 May 2026 02:32:18 +0800 Subject: [PATCH] feat: generate llms.txt from template and annotate entries with star counts - Add llms.txt Jinja2 template with a categories_md placeholder - Extract categories body from README and inject it into the template - Annotate bullet-entry lines with GitHub star counts (N GitHub stars) for the main index.md and bare numbers for llms.txt - Add TestAnnotateEntriesWithStars unit tests Co-Authored-By: Claude --- website/build.py | 76 +++++++++++++++++++++++++++++++- website/templates/llms.txt | 9 ++++ website/tests/test_build.py | 87 ++++++++++++++++++++++++++++++++++++- 3 files changed, 169 insertions(+), 3 deletions(-) create mode 100644 website/templates/llms.txt diff --git a/website/build.py b/website/build.py index 8fb5f384..f9e3aa55 100644 --- a/website/build.py +++ b/website/build.py @@ -14,6 +14,8 @@ from jinja2 import Environment, FileSystemLoader from readme_parser import ParsedGroup, ParsedSection, parse_readme, parse_sponsors GITHUB_REPO_URL_RE = re.compile(r"^https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$") +MARKDOWN_LINK_RE = re.compile(r"\[[^\]]+\]\(([^)\s]+)\)") +BULLET_LINE_RE = re.compile(r"^\s*-\s") SITE_URL = "https://awesome-python.com/" SITEMAP_URL = f"{SITE_URL}sitemap.xml" SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9" @@ -104,6 +106,72 @@ def top_level_heading_text(line: str) -> str | None: return stripped.removeprefix("#").strip().strip("#").strip().strip("*").strip() +LLMS_CATEGORIES_PLACEHOLDER = "{{ categories_md }}" + + +def extract_categories_body(markdown: str) -> str: + """Return content under the `# Categories` heading, excluding the heading line itself.""" + lines = markdown.splitlines(keepends=True) + start_idx = None + end_idx = len(lines) + for i, line in enumerate(lines): + heading = top_level_heading_text(line) + if heading is None: + continue + if start_idx is None and heading.lower() == "categories": + start_idx = i + 1 + while start_idx < len(lines) and lines[start_idx].strip() == "": + start_idx += 1 + elif start_idx is not None and i >= start_idx: + end_idx = i + break + if start_idx is None: + return "" + return "".join(lines[start_idx:end_idx]).rstrip() + "\n" + + +def build_llms_txt(template_text: str, readme_text: str, stars_data: dict[str, dict]) -> str: + """Render the llms.txt template by injecting the README's Categories body, then annotate stars.""" + body = extract_categories_body(readme_text).rstrip() + rendered = template_text.replace(LLMS_CATEGORIES_PLACEHOLDER, body) + return annotate_entries_with_stars(rendered, stars_data, format_stars=str) + + +def annotate_entries_with_stars( + markdown: str, + stars_data: dict[str, dict], + *, + format_stars=None, +) -> str: + """Append the star count to bullet entry lines whose first GitHub link has known star data. + + `format_stars` controls the parenthesized text. Defaults to "{N} GitHub stars". + Pass `str` for a bare number. + """ + if format_stars is None: + format_stars = lambda n: f"{n} GitHub stars" # noqa: E731 lambda-assignment + lines = markdown.splitlines(keepends=True) + out: list[str] = [] + for line in lines: + if not BULLET_LINE_RE.match(line): + out.append(line) + continue + annotated = line + for match in MARKDOWN_LINK_RE.finditer(line): + repo_key = extract_github_repo(match.group(1)) + if not repo_key: + continue + entry = stars_data.get(repo_key) + if not entry or "stars" not in entry: + continue + stripped = line.rstrip("\n") + ending = line[len(stripped):] + annotated = f"{stripped} ({format_stars(entry['stars'])}){ending}" + break + out.append(annotated) + return "".join(out) + + def remove_sponsors_section(markdown: str) -> str: lines = markdown.splitlines(keepends=True) start_idx = None @@ -243,11 +311,15 @@ def build(repo_root: Path) -> None: if static_src.exists(): shutil.copytree(static_src, static_dst, dirs_exist_ok=True) - markdown_index = remove_sponsors_section(readme_text) + markdown_index = annotate_entries_with_stars( + remove_sponsors_section(readme_text), stars_data + ) + llms_template = (website / "templates" / "llms.txt").read_text(encoding="utf-8") + llms_txt = build_llms_txt(llms_template, readme_text, stars_data) (site_dir / "robots.txt").write_text(build_robots_txt(), encoding="utf-8") write_sitemap_xml(site_dir / "sitemap.xml", [(SITE_URL, build_date.date().isoformat())]) (site_dir / "index.md").write_text(markdown_index, encoding="utf-8") - (site_dir / "llms.txt").write_text(markdown_index, encoding="utf-8") + (site_dir / "llms.txt").write_text(llms_txt, encoding="utf-8") print(f"Built single page with {len(parsed_groups)} groups, {len(categories)} categories") print(f"Total entries: {total_entries}") diff --git a/website/templates/llms.txt b/website/templates/llms.txt new file mode 100644 index 00000000..1db05c3b --- /dev/null +++ b/website/templates/llms.txt @@ -0,0 +1,9 @@ +# Awesome Python + +An opinionated guide to the best Python frameworks, libraries, tools, and resources. + +Use this curated list when you need to find a high-quality Python library or tool for tasks such as web development, data science, machine learning, AI agents, automation, testing, or DevOps. The trailing number on each entry is its star count on GitHub. + +# Categories + +{{ categories_md }} diff --git a/website/tests/test_build.py b/website/tests/test_build.py index 1feab77d..32b01917 100644 --- a/website/tests/test_build.py +++ b/website/tests/test_build.py @@ -9,6 +9,7 @@ from html.parser import HTMLParser from pathlib import Path from build import ( + annotate_entries_with_stars, build, detect_source_type, extract_entries, @@ -108,6 +109,16 @@ class TestBuild: "{% endblock %}", encoding="utf-8", ) + (tpl_dir / "llms.txt").write_text( + "# Awesome Python\n" + "\n" + "Use this list to find Python tools.\n" + "\n" + "# Categories\n" + "\n" + "{{ categories_md }}\n", + encoding="utf-8", + ) def _copy_real_templates(self, tmp_path): real_tpl = Path(__file__).parent / ".." / "templates" @@ -223,6 +234,7 @@ class TestBuild: ## Widgets - [w1](https://example.com) - A widget. + - [w2](https://github.com/owner/w2) - A starred widget. # Contributing @@ -231,6 +243,13 @@ class TestBuild: (tmp_path / "README.md").write_text(readme, encoding="utf-8") self._copy_real_templates(tmp_path) + data_dir = tmp_path / "website" / "data" + data_dir.mkdir(parents=True) + stars = { + "owner/w2": {"stars": 42, "owner": "owner", "fetched_at": "2026-01-01T00:00:00+00:00"}, + } + (data_dir / "github_stars.json").write_text(json.dumps(stars), encoding="utf-8") + build(tmp_path) site = tmp_path / "website" / "output" @@ -239,13 +258,23 @@ class TestBuild: llms_txt = (site / "llms.txt").read_text(encoding="utf-8") assert '' in index_html - assert index_md == llms_txt assert index_md.startswith("# Awesome Python\n\nIntro.\n\n# Categories") assert "# **Sponsors**" not in index_md assert "Sponsor" not in index_md assert "SPONSORSHIP.md" not in index_md assert "## Widgets" in index_md assert "- [w1](https://example.com) - A widget." in index_md + assert "- [w2](https://github.com/owner/w2) - A starred widget. (42 GitHub stars)" in index_md + + assert llms_txt.startswith("# Awesome Python\n") + assert "# Categories" in llms_txt + assert "Use this curated list" in llms_txt + assert "## Widgets" in llms_txt + assert "- [w1](https://example.com) - A widget." in llms_txt + assert "- [w2](https://github.com/owner/w2) - A starred widget. (42)" in llms_txt + assert "{{ categories_md }}" not in llms_txt + assert "# Contributing" not in llms_txt + assert "Help!" not in llms_txt def test_build_cleans_stale_output(self, tmp_path): readme = textwrap.dedent("""\ @@ -604,3 +633,59 @@ class TestExtractEntries: categories = [c for g in groups for c in g["categories"]] entries = extract_entries(categories, groups) assert entries[0]["source_type"] == "Built-in" + + +# --------------------------------------------------------------------------- +# annotate_entries_with_stars +# --------------------------------------------------------------------------- + + +class TestAnnotateEntriesWithStars: + def test_appends_star_count_to_bullet(self): + markdown = "- [foo](https://github.com/owner/foo) - A foo.\n" + stars = {"owner/foo": {"stars": 123, "owner": "owner"}} + assert annotate_entries_with_stars(markdown, stars) == ( + "- [foo](https://github.com/owner/foo) - A foo. (123 GitHub stars)\n" + ) + + def test_uses_first_github_link(self): + markdown = ( + "- [foo](https://github.com/owner/foo) - A foo. " + "Also [bar](https://github.com/owner/bar).\n" + ) + stars = { + "owner/foo": {"stars": 10, "owner": "owner"}, + "owner/bar": {"stars": 99, "owner": "owner"}, + } + assert annotate_entries_with_stars(markdown, stars) == ( + "- [foo](https://github.com/owner/foo) - A foo. " + "Also [bar](https://github.com/owner/bar). (10 GitHub stars)\n" + ) + + def test_skips_entries_without_star_data(self): + markdown = "- [foo](https://github.com/owner/foo) - A foo.\n" + assert annotate_entries_with_stars(markdown, {}) == markdown + + def test_skips_non_github_links(self): + markdown = "- [foo](https://example.com) - A foo.\n" + stars = {"owner/foo": {"stars": 1, "owner": "owner"}} + assert annotate_entries_with_stars(markdown, stars) == markdown + + def test_skips_non_bullet_lines(self): + markdown = "See [foo](https://github.com/owner/foo) for details.\n" + stars = {"owner/foo": {"stars": 1, "owner": "owner"}} + assert annotate_entries_with_stars(markdown, stars) == markdown + + def test_handles_indented_bullets(self): + markdown = " - [foo](https://github.com/owner/foo)\n" + stars = {"owner/foo": {"stars": 7, "owner": "owner"}} + assert annotate_entries_with_stars(markdown, stars) == ( + " - [foo](https://github.com/owner/foo) (7 GitHub stars)\n" + ) + + def test_preserves_lines_without_trailing_newline(self): + markdown = "- [foo](https://github.com/owner/foo) - A foo." + stars = {"owner/foo": {"stars": 5, "owner": "owner"}} + assert annotate_entries_with_stars(markdown, stars) == ( + "- [foo](https://github.com/owner/foo) - A foo. (5 GitHub stars)" + )