diff --git a/.gitignore b/.gitignore
index ca26a6e8..0d9f410b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,12 +10,12 @@ __pycache__/
 website/output/
 website/data/
 
-# claude code
-.claude/skills/
-.gstack/
-.playwright-cli/
-.superpowers/
-skills-lock.json
+# planning docs
+docs/
 
-# codex
+# agents
 .agents/
+.claude/skills/
+.superpowers/
+.playwright-cli/
+skills-lock.json
diff --git a/README.md b/README.md
index 51ae9d16..107b6859 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # Awesome Python
 
-An opinionated list of Python frameworks, libraries, tools, and resources.
+An opinionated guide to the best Python frameworks, libraries, tools, and resources.
 
 # **Sponsors**
 
diff --git a/website/build.py b/website/build.py
index c223ef18..8fb5f384 100644
--- a/website/build.py
+++ b/website/build.py
@@ -4,6 +4,8 @@
 import json
 import re
 import shutil
+import xml.etree.ElementTree as ET
+from collections.abc import Sequence
 from datetime import UTC, datetime
 from pathlib import Path
 from typing import Any
@@ -12,6 +14,9 @@ from jinja2 import Environment, FileSystemLoader
 from readme_parser import ParsedGroup, ParsedSection, parse_readme, parse_sponsors
 
 GITHUB_REPO_URL_RE = re.compile(r"^https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$")
+SITE_URL = "https://awesome-python.com/"
+SITEMAP_URL = f"{SITE_URL}sitemap.xml"
+SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"
 
 SOURCE_TYPE_DOMAINS = {
     "docs.python.org": "Built-in",
@@ -67,6 +72,59 @@ def sort_entries(entries: list[dict]) -> list[dict]:
     return sorted(entries, key=sort_key)
 
 
+def build_robots_txt() -> str:
+    return (
+        "User-agent: *\n"
+        "Content-Signal: search=yes, ai-input=yes, ai-train=yes\n"
+        "Allow: /\n"
+        "\n"
+        f"Sitemap: {SITEMAP_URL}\n"
+    )
+
+
+def write_sitemap_xml(path: Path, urls: Sequence[tuple[str, str]]) -> None:
+    ET.register_namespace("", SITEMAP_NS)
+    urlset = ET.Element(f"{{{SITEMAP_NS}}}urlset")
+    for url, lastmod in urls:
+        url_el = ET.SubElement(urlset, f"{{{SITEMAP_NS}}}url")
+        loc_el = ET.SubElement(url_el, f"{{{SITEMAP_NS}}}loc")
+        loc_el.text = url
+        lastmod_el = ET.SubElement(url_el, f"{{{SITEMAP_NS}}}lastmod")
+        lastmod_el.text = lastmod
+
+    ET.ElementTree(urlset).write(path, encoding="utf-8", xml_declaration=True)
+    with path.open("ab") as f:
+        f.write(b"\n")
+
+
+def top_level_heading_text(line: str) -> str | None:
+    stripped = line.strip()
+    if not stripped.startswith("# "):
+        return None
+    return stripped.removeprefix("#").strip().strip("#").strip().strip("*").strip()
+
+
+def remove_sponsors_section(markdown: str) -> str:
+    lines = markdown.splitlines(keepends=True)
+    start_idx = None
+    for i, line in enumerate(lines):
+        heading = top_level_heading_text(line)
+        if heading and heading.lower() == "sponsors":
+            start_idx = i
+            break
+
+    if start_idx is None:
+        return markdown
+
+    end_idx = len(lines)
+    for i, line in enumerate(lines[start_idx + 1 :], start=start_idx + 1):
+        if top_level_heading_text(line):
+            end_idx = i
+            break
+
+    return "".join(lines[:start_idx] + lines[end_idx:])
+
+
 def extract_entries(
     categories: list[ParsedSection],
     groups: list[ParsedGroup],
@@ -131,6 +189,7 @@ def build(repo_root: Path) -> None:
     categories = [cat for g in parsed_groups for cat in g["categories"]]
     total_entries = sum(c["entry_count"] for c in categories)
     entries = extract_entries(categories, parsed_groups)
+    build_date = datetime.now(UTC)
 
     stars_data = load_stars(website / "data" / "github_stars.json")
 
@@ -155,6 +214,8 @@ def build(repo_root: Path) -> None:
     env = Environment(
         loader=FileSystemLoader(website / "templates"),
         autoescape=True,
+        trim_blocks=True,
+        lstrip_blocks=True,
     )
 
     site_dir = website / "output"
@@ -171,7 +232,7 @@ def build(repo_root: Path) -> None:
             total_entries=total_entries,
             total_categories=len(categories),
             repo_stars=repo_stars,
-            build_date=datetime.now(UTC).strftime("%B %d, %Y"),
+            build_date=build_date.strftime("%B %d, %Y"),
             sponsors=sponsors,
         ),
         encoding="utf-8",
@@ -182,7 +243,11 @@ def build(repo_root: Path) -> None:
     if static_src.exists():
         shutil.copytree(static_src, static_dst, dirs_exist_ok=True)
 
-    (site_dir / "llms.txt").write_text(readme_text, encoding="utf-8")
+    markdown_index = remove_sponsors_section(readme_text)
+    (site_dir / "robots.txt").write_text(build_robots_txt(), encoding="utf-8")
+    write_sitemap_xml(site_dir / "sitemap.xml", [(SITE_URL, build_date.date().isoformat())])
+    (site_dir / "index.md").write_text(markdown_index, encoding="utf-8")
+    (site_dir / "llms.txt").write_text(markdown_index, encoding="utf-8")
 
     print(f"Built single page with {len(parsed_groups)} groups, {len(categories)} categories")
     print(f"Total entries: {total_entries}")
diff --git a/website/templates/base.html b/website/templates/base.html
index 34546e73..af112095 100644
--- a/website/templates/base.html
+++ b/website/templates/base.html
@@ -1,26 +1,27 @@
+    {% set default_meta_title = "Awesome Python" %}
+    {% set default_meta_description = "An opinionated guide to the best Python frameworks, libraries, and tools. Explore " ~ (entries | length) ~ " curated projects across " ~ total_categories ~ " categories, from AI and agents to data science and web development." %}
+    {% set canonical_url = "https://awesome-python.com/" %}
+    {% set social_image_url = "https://awesome-python.com/static/og-image.png" %}
+    {% set meta_title %}{% block title %}{{ default_meta_title }}{% endblock %}{% endset %}
+    {% set meta_description %}{% block description %}{{ default_meta_description }}{% endblock %}{% endset %}
-    <title>{% block title %}Awesome Python{% endblock %}</title>
-
-
+    <title>{{ meta_title | trim }}</title>
+    <meta name="description" content="{{ meta_description | trim }}">
+    <link rel="canonical" href="{{ canonical_url }}">
+    <meta property="og:type" content="website">
-
-
-
-
-
+    <meta property="og:title" content="{{ meta_title | trim }}">
+    <meta property="og:description" content="{{ meta_description | trim }}">
+    <meta property="og:url" content="{{ canonical_url }}">
+    <meta property="og:image" content="{{ social_image_url }}">
+    <meta name="twitter:card" content="summary_large_image">
+    <meta name="twitter:title" content="{{ meta_title | trim }}">
+    <meta name="twitter:description" content="{{ meta_description | trim }}">
+    <meta name="twitter:image" content="{{ social_image_url }}">
diff --git a/website/tests/test_build.py b/website/tests/test_build.py
index 0b22609a..1feab77d 100644
--- a/website/tests/test_build.py
+++ b/website/tests/test_build.py
@@ -3,6 +3,9 @@
 import json
 import shutil
 import textwrap
+import xml.etree.ElementTree as ET
+from datetime import UTC, date, datetime
+from html.parser import HTMLParser
 from pathlib import Path
 
 from build import (
@@ -15,6 +18,40 @@ from build import (
 )
 from readme_parser import parse_readme, slugify
 
+
+class HeadMetadataParser(HTMLParser):
+    def __init__(self):
+        super().__init__()
+        self.title_count = 0
+        self.title = ""
+        self.meta_by_name = {}
+        self.meta_by_property = {}
+        self.links_by_rel = {}
+        self._in_title = False
+
+    def handle_starttag(self, tag, attrs):
+        attrs = dict(attrs)
+        if tag == "title":
+            self.title_count += 1
+            self._in_title = True
+        elif tag == "meta":
+            if "name" in attrs:
+                self.meta_by_name[attrs["name"]] = attrs.get("content", "")
+            if "property" in attrs:
+                self.meta_by_property[attrs["property"]] = attrs.get("content", "")
+        elif tag == "link" and attrs.get("rel"):
+            for rel in attrs["rel"].split():
+                self.links_by_rel[rel] = attrs.get("href", "")
+
+    def handle_endtag(self, tag):
+        if tag == "title":
+            self._in_title = False
+
+    def handle_data(self, data):
+        if self._in_title:
+            self.title += data
+
+
 # ---------------------------------------------------------------------------
 # slugify
 # ---------------------------------------------------------------------------
@@ -72,6 +109,11 @@
             encoding="utf-8",
         )
 
+    def _copy_real_templates(self, tmp_path):
+        real_tpl = Path(__file__).parent / ".." / "templates"
+        tpl_dir = tmp_path / "website" / "templates"
+        shutil.copytree(real_tpl, tpl_dir)
+
     def test_build_creates_single_page(self, tmp_path):
         readme = textwrap.dedent("""\
             # Awesome Python
@@ -114,6 +156,97 @@
         # No category sub-pages
         assert not (site / "categories").exists()
 
+    def test_build_creates_root_discovery_files(self, tmp_path):
+        readme = textwrap.dedent("""\
+            # Awesome Python
+
+            Intro.
+
+            ---
+
+            ## Widgets
+
+            - [w1](https://example.com) - A widget.
+
+            # Contributing
+
+            Help!
+            """)
+        self._make_repo(tmp_path, readme)
+        start_date = datetime.now(UTC).date()
+        build(tmp_path)
+        end_date = datetime.now(UTC).date()
+
+        site = tmp_path / "website" / "output"
+        robots = (site / "robots.txt").read_text(encoding="utf-8")
+        assert robots == (
+            "User-agent: *\n"
+            "Content-Signal: search=yes, ai-input=yes, ai-train=yes\n"
+            "Allow: /\n"
+            "\n"
+            "Sitemap: https://awesome-python.com/sitemap.xml\n"
+        )
+
+        sitemap = ET.parse(site / "sitemap.xml")
+        root = sitemap.getroot()
+        ns = {"sitemap": "http://www.sitemaps.org/schemas/sitemap/0.9"}
+        locs = [loc.text for loc in root.findall("sitemap:url/sitemap:loc", ns)]
+        lastmods = [lastmod.text for lastmod in root.findall("sitemap:url/sitemap:lastmod", ns)]
+
+        assert root.tag == "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset"
+        assert locs == ["https://awesome-python.com/"]
+        assert len(lastmods) == 1
+        assert start_date <= date.fromisoformat(lastmods[0]) <= end_date
+        assert all(loc.startswith("https://awesome-python.com/") for loc in locs)
+        assert all("?" not in loc for loc in locs)
+
+    def test_build_creates_markdown_alternate_without_sponsors(self, tmp_path):
+        readme = textwrap.dedent("""\
+            # Awesome Python
+
+            Intro.
+
+            # **Sponsors**
+
+            - **[Sponsor](https://sponsor.example.com)**: Sponsored tool.
+
+            > Become a sponsor: [Sponsor us](SPONSORSHIP.md).
+
+            # Categories
+
+            **Tools**
+
+            - [Widgets](#widgets)
+
+            ---
+
+            ## Widgets
+
+            - [w1](https://example.com) - A widget.
+
+            # Contributing
+
+            Help!
+            """)
+        (tmp_path / "README.md").write_text(readme, encoding="utf-8")
+        self._copy_real_templates(tmp_path)
+
+        build(tmp_path)
+
+        site = tmp_path / "website" / "output"
+        index_html = (site / "index.html").read_text(encoding="utf-8")
+        index_md = (site / "index.md").read_text(encoding="utf-8")
+        llms_txt = (site / "llms.txt").read_text(encoding="utf-8")
+
+        assert '' in index_html
+        assert index_md == llms_txt
+        assert index_md.startswith("# Awesome Python\n\nIntro.\n\n# Categories")
+        assert "# **Sponsors**" not in index_md
+        assert "Sponsor" not in index_md
+        assert "SPONSORSHIP.md" not in index_md
+        assert "## Widgets" in index_md
+        assert "- [w1](https://example.com) - A widget." in index_md
+
     def test_build_cleans_stale_output(self, tmp_path):
         readme = textwrap.dedent("""\
             # T
@@ -235,6 +368,40 @@
         # Expand content present
         assert "expand-content" in html
 
+    def test_index_contains_aligned_homepage_metadata(self, tmp_path):
+        readme = (Path(__file__).parents[2] / "README.md").read_text(encoding="utf-8")
+        (tmp_path / "README.md").write_text(readme, encoding="utf-8")
+        self._copy_real_templates(tmp_path)
+
+        build(tmp_path)
+
+        parsed_groups = parse_readme(readme)
+        categories = [cat for group in parsed_groups for cat in group["categories"]]
+        entries = extract_entries(categories, parsed_groups)
+        html = (tmp_path / "website" / "output" / "index.html").read_text(encoding="utf-8")
+        parser = HeadMetadataParser()
+        parser.feed(html)
+
+        expected_title = "Awesome Python"
+        expected_description = f"An opinionated guide to the best Python frameworks, libraries, and tools. Explore {len(entries)} curated projects across {len(categories)} categories, from AI and agents to data science and web development."
+        expected_url = "https://awesome-python.com/"
+        expected_image = "https://awesome-python.com/static/og-image.png"
+
+        assert parser.title_count == 1
+        assert parser.title.strip() == expected_title
+        assert parser.meta_by_name["description"] == expected_description
+        assert parser.links_by_rel["canonical"] == expected_url
+        assert parser.meta_by_property["og:type"] == "website"
+        assert parser.meta_by_property["og:title"] == expected_title
+        assert parser.meta_by_property["og:description"] == expected_description
+        assert parser.meta_by_property["og:image"] == expected_image
+        assert parser.meta_by_property["og:url"] == expected_url
+        assert parser.meta_by_name["twitter:card"] == "summary_large_image"
+        assert parser.meta_by_name["twitter:title"] == expected_title
+        assert parser.meta_by_name["twitter:description"] == expected_description
+        assert parser.meta_by_name["twitter:image"] == expected_image
+        assert "\n