mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-06 01:28:35 -05:00
Adds three new violation classes to the audit script:
2. Bare subscripts: flags `_{lat}`, `_{hw}`, `_{vol}`, `_{peak}`, `_{req}`,
and other multi-letter labels that should be wrapped per book-prose §2.
Provides direct visibility into Bucket-B-style notation drift.
3. Operator spacing (stub): placeholder for `\times` / `\cdot` spacing
hygiene. Currently a no-op, ready for future tightening.
4. BW vs \text{BW} (stub): placeholder for bare BW detection in math.
Also tightens the existing Iron Law check to use the simpler regex
`R_{\text{peak}}\\cdot\\eta` instead of the previously over-escaped form.
80 lines
3.0 KiB
Python
80 lines
3.0 KiB
Python
import os
|
|
import re
|
|
import sys
|
|
|
|
# Root of the Quarto sources to audit. Defaults to the original author's
# local checkout; pass a different root as the first command-line argument
# to run the audit on another machine or in CI.
directory = (
    sys.argv[1]
    if len(sys.argv) > 1
    else '/Users/VJ/GitHub/MLSysBook-notation-audit/book/quarto/contents/'
)

# Running total of violations across all audited files. check_file()
# increments it through a `global` declaration, and the exit status at the
# end of the script is derived from it.
violations = 0
|
|
|
|
def check_file(filepath):
    """Scan one source file for notation violations and report each one.

    Reads ``filepath`` as UTF-8 text, runs every notation check below against
    the whole file content, prints one ``[Violation] ...`` line per finding,
    and adds the number of findings to the module-level ``violations``
    counter.

    Args:
        filepath: Path of the file to audit.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    global violations

    # Decode explicitly as UTF-8 so the audit does not depend on the
    # platform's default locale encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # 1. Iron Law efficiency: R_{\text{peak}} \cdot \eta where \eta carries
    # no subscript (i.e. is not followed by `_` or `{`).
    for _ in re.finditer(r'R_{\\text{peak}}\s*\\cdot\s*\\eta(?![_\{])', content):
        print(f"[Violation] Bare \\eta in Iron Law compute term: {filepath}")
        violations += 1

    # 2. Bare subscripts (e.g., _{lat} instead of _{\text{lat}}).
    # Reported at most once per label per file, not once per occurrence.
    labels = ['lat', 'hw', 'vol', 'peak', 'wait', 'compute', 'comm', 'overlap', 'scaling', 'step', 'total', 'move', 'acquire', 'label', 'store', 'process', 'avg', 'eff', 'device', 'net', 'io', 'req']
    for label in labels:
        if f'_{{{label}}}' in content:
            print(f"[Violation] Bare subscript _{{{label}}} found in {filepath}. Use _{{\\text{{{label}}}}}")
            violations += 1

    # 3. Operator spacing (stub): detects \times touching a non-whitespace
    # character on at least one side, but reports nothing yet. \times is
    # allowed in TikZ, so a real check must first restrict itself to math
    # blocks.
    if re.search(r'[^\s]\\times', content) or re.search(r'\\times[^\s]', content):
        pass  # TODO: flag only when inside a $$ math block.

    # 4. BW vs \text{BW} (stub): detects a standalone BW not wrapped in
    # \text{...} outside notation.qmd, but reports nothing yet.
    if re.search(r'(?<!\\text\{)\bBW\b', content) and not filepath.endswith('notation.qmd'):
        pass  # TODO: distinguish math blocks from running text.

    # 5. Little's Law written with L and W. The \cdot is optional as a
    # whole, so both `L = \lambda \cdot W` and `L = \lambda W` are flagged
    # (the previous `\\cdot?` made only the final `t` optional).
    for _ in re.finditer(r'L\s*=\s*\\lambda\s*(?:\\cdot)?\s*W', content):
        print(f"[Violation] Little's Law uses L and W: {filepath}")
        violations += 1

    # 6. Efficiency(N), with or without a (partial) \text{...} wrapper.
    for _ in re.finditer(r'\\?t?e?x?t?\{?Efficiency\}?\(N\)', content):
        print(f"[Violation] Scaling Efficiency uses Efficiency(N): {filepath}")
        violations += 1

    # 7. Unit spacing: an inline `{python} ...` chunk immediately followed
    # by a unit with no space in between.
    for match in re.finditer(r'(`\{python\}[^`]+`)(ms|GB|MB|KB|TB|W|Gbps|percent)\b', content):
        print(f"[Violation] Missing space before unit {match.group(2)} after python chunk: {filepath}")
        violations += 1

    # 8. Hardware Balance introduced as plain $B$. The message closes its
    # inline math and recommends the \text-wrapped subscript, consistent
    # with check 2 above.
    for _ in re.finditer(r'Hardware Balance \(\$B\$\)', content, re.IGNORECASE):
        print(f"[Violation] Hardware Balance uses $B$ instead of $B_{{\\text{{hw}}}}$: {filepath}")
        violations += 1

    # 9. Lowercase b for batch size: "batch size b" or "batch size of b".
    for match in re.finditer(r'batch size (?:of )?\$?b\$?\b', content, re.IGNORECASE):
        # IGNORECASE also lets the pattern match an uppercase B, so keep
        # only matches whose symbol is really a lowercase b.
        if match.group(0).endswith(('b', 'b$')):
            print(f"[Violation] Lowercase b for batch size: {filepath}")
            violations += 1
|
|
|
|
# Audit every .qmd file found anywhere under the configured root directory.
for dirpath, _subdirs, filenames in os.walk(directory):
    qmd_paths = (
        os.path.join(dirpath, name)
        for name in filenames
        if name.endswith('.qmd')
    )
    for qmd_path in qmd_paths:
        check_file(qmd_path)

# Exit non-zero when anything was flagged, so callers can gate on the
# process status.
if violations != 0:
    print(f"AUDIT FAILED: {violations} violations found.")
    sys.exit(1)

print("AUDIT PASSED: 0 violations found.")
sys.exit(0)
|