mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-06 01:28:35 -05:00
chore: snapshot current audit progress and infrastructure
This commit is contained in:
13
audit_results.jsonl
Normal file
13
audit_results.jsonl
Normal file
@@ -0,0 +1,13 @@
|
||||
{"qid": "cloud-0267", "track": "cloud", "format_compliance": "pass", "format_issues": [], "level_fit": "pass", "level_fit_rationale": "The question directly tests recall of a factual ratio (20 tokens per parameter from the Chinchilla paper) and a basic application of it, which perfectly aligns with L1 Knowledge/Recall.", "suggested_level": null, "coherence": "pass", "coherence_failure_mode": "none", "coherence_rationale": "The scenario, question, and solution are completely coherent. There are no physical absurdities, the reference to Chinchilla scaling laws is accurate, and the scenario sets up the question perfectly.", "math_correct": "pass", "math_errors": [], "title_quality": "good", "title_suggestion": null}
|
||||
{"qid": "global-0086", "track": "global", "format_compliance": "pass", "format_issues": [], "level_fit": "pass", "level_fit_rationale": "Claimed L6+ (Create) matches the requirement to design a complex multi-tier serving and routing architecture.", "suggested_level": null, "coherence": "pass", "coherence_failure_mode": "none", "coherence_rationale": "Hardware bounds and swap latencies are realistic for Llama-3 70B on A100s.", "math_correct": "pass", "math_errors": [], "title_quality": "good", "title_suggestion": null}
|
||||
{"qid": "global-0090", "track": "global", "format_compliance": "pass", "format_issues": [], "level_fit": "pass", "level_fit_rationale": "Claimed L6+ matches the design complexity of balancing real-time latency with asynchronous fairness auditing at massive scale.", "suggested_level": null, "coherence": "pass", "coherence_failure_mode": "none", "coherence_rationale": "Latency estimates and data volumes for asynchronous logging are consistent and realistic.", "math_correct": "pass", "math_errors": [], "title_quality": "good", "title_suggestion": null}
|
||||
{"qid": "global-0100", "track": "global", "format_compliance": "pass", "format_issues": [], "level_fit": "pass", "level_fit_rationale": "L4 (Analyze) is appropriate for root-cause analysis of a subtle mathematical interaction between RoPE and quantization.", "suggested_level": null, "coherence": "pass", "coherence_failure_mode": "none", "coherence_rationale": "The mathematical explanation of RoPE high-frequency components being zeroed out by INT8 step sizes is physically sound.", "math_correct": "pass", "math_errors": [], "title_quality": "good", "title_suggestion": null}
|
||||
{"qid": "global-0113", "track": "global", "format_compliance": "pass", "format_issues": [], "level_fit": "pass", "level_fit_rationale": "L4 (Analyze) correctly identifies the task of comparing arithmetic intensities to explain performance degradation.", "suggested_level": null, "coherence": "pass", "coherence_failure_mode": "none", "coherence_rationale": "The roofline model analysis accurately reflects MoE behavior at small batch sizes where expert weights must be loaded.", "math_correct": "pass", "math_errors": [], "title_quality": "good", "title_suggestion": null}
|
||||
{"qid": "global-0117", "track": "global", "format_compliance": "pass", "format_issues": [], "level_fit": "pass", "level_fit_rationale": "L6+ (Mastery) is fitting for a design problem involving physical bandwidth limits and custom memory hierarchy management.", "suggested_level": null, "coherence": "pass", "coherence_failure_mode": "none", "coherence_rationale": "PCIe Gen5 bandwidth and KV cache size calculations for 1M context are accurate.", "math_correct": "pass", "math_errors": [], "title_quality": "good", "title_suggestion": null}
|
||||
{"qid": "global-0120", "track": "global", "format_compliance": "pass", "format_issues": [], "level_fit": "pass", "level_fit_rationale": "L4 (Analyze) is appropriate for diagnosing a performance bottleneck caused by memory bandwidth limits in MoE serving.", "suggested_level": null, "coherence": "pass", "coherence_failure_mode": "none", "coherence_rationale": "Throughput calculations for batch-1 MoE decoding accurately reflect the memory-bandwidth bound nature of the task.", "math_correct": "pass", "math_errors": [], "title_quality": "good", "title_suggestion": null}
|
||||
{"qid": "global-0121", "track": "global", "format_compliance": "pass", "format_issues": [], "level_fit": "pass", "level_fit_rationale": "L6+ (Mastery) matches the complex system-design task of overlapping PCIe transfers with compute for massive adapter counts.", "suggested_level": null, "coherence": "pass", "coherence_failure_mode": "none", "coherence_rationale": "PCIe Gen5 bandwidth and adapter transfer times are physically realistic.", "math_correct": "pass", "math_errors": [], "title_quality": "good", "title_suggestion": null}
|
||||
{"qid": "global-0123", "track": "global", "format_compliance": "pass", "format_issues": [], "level_fit": "pass", "level_fit_rationale": "L4 (Analyze) correctly targets the trade-offs between activation checkpointing and offloading to Host DRAM.", "suggested_level": null, "coherence": "pass", "coherence_failure_mode": "none", "coherence_rationale": "Calculations for 3D activation tensor sizes and PCIe transfer overhead are sound.", "math_correct": "pass", "math_errors": [], "title_quality": "good", "title_suggestion": null}
|
||||
{"qid": "global-0126", "track": "global", "format_compliance": "pass", "format_issues": [], "level_fit": "pass", "level_fit_rationale": "L4 (Analyze) is fitting for explaining the counter-intuitive performance drop when moving to theoretically 'faster' operations.", "suggested_level": null, "coherence": "pass", "coherence_failure_mode": "none", "coherence_rationale": "The arithmetic intensity analysis correctly explains why depthwise convolutions become memory-bound on an NPU.", "math_correct": "pass", "math_errors": [], "title_quality": "good", "title_suggestion": null}
|
||||
{"qid": "global-0137", "track": "global", "format_compliance": "pass", "format_issues": [], "level_fit": "pass", "level_fit_rationale": "L5 (Evaluate/Create) is appropriate for analyzing a multi-tenant RAG architecture and proposing physical improvements to meet SLAs.", "suggested_level": null, "coherence": "pass", "coherence_failure_mode": "none", "coherence_rationale": "KV cache size for a 70B model and PCIe Gen4 transfer rates are calculated correctly.", "math_correct": "pass", "math_errors": [], "title_quality": "good", "title_suggestion": null}
|
||||
{"qid": "global-0154", "track": "global", "format_compliance": "pass", "format_issues": [], "level_fit": "pass", "level_fit_rationale": "L4 (Analyze) is appropriate for diagnosing a hardware-level bus contention issue using latency data.", "suggested_level": null, "coherence": "pass", "coherence_failure_mode": "none", "coherence_rationale": "SRAM bank contention calculations and arbitration penalties are consistent with real-world microcontroller behavior.", "math_correct": "pass", "math_errors": [], "title_quality": "good", "title_suggestion": null}
|
||||
{"qid": "global-0158", "track": "global", "format_compliance": "pass", "format_issues": [], "level_fit": "pass", "level_fit_rationale": "L4 (Analyze) correctly identifies the task of quantifying jitter caused by bus matrix arbitration.", "suggested_level": null, "coherence": "pass", "coherence_failure_mode": "none", "coherence_rationale": "DMA frequency and CPU stall duration calculations are physically sound.", "math_correct": "pass", "math_errors": [], "title_quality": "good", "title_suggestion": null}
|
||||
4046
cloud_published.txt
Normal file
4046
cloud_published.txt
Normal file
File diff suppressed because it is too large
Load Diff
2063
edge_published.txt
Normal file
2063
edge_published.txt
Normal file
File diff suppressed because it is too large
Load Diff
317
global_published.txt
Normal file
317
global_published.txt
Normal file
@@ -0,0 +1,317 @@
|
||||
interviews/vault/questions/global/architecture/global-0086.yaml
|
||||
interviews/vault/questions/global/architecture/global-0090.yaml
|
||||
interviews/vault/questions/global/architecture/global-0100.yaml
|
||||
interviews/vault/questions/global/architecture/global-0113.yaml
|
||||
interviews/vault/questions/global/architecture/global-0117.yaml
|
||||
interviews/vault/questions/global/architecture/global-0120.yaml
|
||||
interviews/vault/questions/global/architecture/global-0121.yaml
|
||||
interviews/vault/questions/global/architecture/global-0123.yaml
|
||||
interviews/vault/questions/global/architecture/global-0126.yaml
|
||||
interviews/vault/questions/global/architecture/global-0137.yaml
|
||||
interviews/vault/questions/global/architecture/global-0154.yaml
|
||||
interviews/vault/questions/global/architecture/global-0158.yaml
|
||||
interviews/vault/questions/global/compute/global-0003.yaml
|
||||
interviews/vault/questions/global/compute/global-0005.yaml
|
||||
interviews/vault/questions/global/compute/global-0016.yaml
|
||||
interviews/vault/questions/global/compute/global-0025.yaml
|
||||
interviews/vault/questions/global/compute/global-0026.yaml
|
||||
interviews/vault/questions/global/compute/global-0027.yaml
|
||||
interviews/vault/questions/global/compute/global-0028.yaml
|
||||
interviews/vault/questions/global/compute/global-0047.yaml
|
||||
interviews/vault/questions/global/compute/global-0048.yaml
|
||||
interviews/vault/questions/global/compute/global-0092.yaml
|
||||
interviews/vault/questions/global/compute/global-0105.yaml
|
||||
interviews/vault/questions/global/compute/global-0110.yaml
|
||||
interviews/vault/questions/global/compute/global-0112.yaml
|
||||
interviews/vault/questions/global/compute/global-0116.yaml
|
||||
interviews/vault/questions/global/compute/global-0127.yaml
|
||||
interviews/vault/questions/global/compute/global-0138.yaml
|
||||
interviews/vault/questions/global/compute/global-0140.yaml
|
||||
interviews/vault/questions/global/compute/global-0146.yaml
|
||||
interviews/vault/questions/global/compute/global-0200.yaml
|
||||
interviews/vault/questions/global/compute/global-0207.yaml
|
||||
interviews/vault/questions/global/compute/global-0212.yaml
|
||||
interviews/vault/questions/global/compute/global-0221.yaml
|
||||
interviews/vault/questions/global/compute/global-0226.yaml
|
||||
interviews/vault/questions/global/compute/global-0304.yaml
|
||||
interviews/vault/questions/global/compute/global-0312.yaml
|
||||
interviews/vault/questions/global/compute/global-0319.yaml
|
||||
interviews/vault/questions/global/compute/global-0375.yaml
|
||||
interviews/vault/questions/global/compute/global-0378.yaml
|
||||
interviews/vault/questions/global/cross-cutting/global-0052.yaml
|
||||
interviews/vault/questions/global/cross-cutting/global-0053.yaml
|
||||
interviews/vault/questions/global/cross-cutting/global-0173.yaml
|
||||
interviews/vault/questions/global/cross-cutting/global-0177.yaml
|
||||
interviews/vault/questions/global/cross-cutting/global-0187.yaml
|
||||
interviews/vault/questions/global/cross-cutting/global-0192.yaml
|
||||
interviews/vault/questions/global/cross-cutting/global-0196.yaml
|
||||
interviews/vault/questions/global/cross-cutting/global-0206.yaml
|
||||
interviews/vault/questions/global/cross-cutting/global-0214.yaml
|
||||
interviews/vault/questions/global/cross-cutting/global-0232.yaml
|
||||
interviews/vault/questions/global/cross-cutting/global-0257.yaml
|
||||
interviews/vault/questions/global/cross-cutting/global-0267.yaml
|
||||
interviews/vault/questions/global/cross-cutting/global-0275.yaml
|
||||
interviews/vault/questions/global/cross-cutting/global-0301.yaml
|
||||
interviews/vault/questions/global/cross-cutting/global-0307.yaml
|
||||
interviews/vault/questions/global/cross-cutting/global-0314.yaml
|
||||
interviews/vault/questions/global/cross-cutting/global-0317.yaml
|
||||
interviews/vault/questions/global/cross-cutting/global-0383.yaml
|
||||
interviews/vault/questions/global/data/global-0010.yaml
|
||||
interviews/vault/questions/global/data/global-0043.yaml
|
||||
interviews/vault/questions/global/data/global-0044.yaml
|
||||
interviews/vault/questions/global/data/global-0045.yaml
|
||||
interviews/vault/questions/global/data/global-0046.yaml
|
||||
interviews/vault/questions/global/data/global-0067.yaml
|
||||
interviews/vault/questions/global/data/global-0068.yaml
|
||||
interviews/vault/questions/global/data/global-0091.yaml
|
||||
interviews/vault/questions/global/data/global-0108.yaml
|
||||
interviews/vault/questions/global/data/global-0122.yaml
|
||||
interviews/vault/questions/global/data/global-0274.yaml
|
||||
interviews/vault/questions/global/data/global-0293.yaml
|
||||
interviews/vault/questions/global/data/global-0339.yaml
|
||||
interviews/vault/questions/global/deployment/global-0014.yaml
|
||||
interviews/vault/questions/global/deployment/global-0034.yaml
|
||||
interviews/vault/questions/global/deployment/global-0035.yaml
|
||||
interviews/vault/questions/global/deployment/global-0036.yaml
|
||||
interviews/vault/questions/global/deployment/global-0037.yaml
|
||||
interviews/vault/questions/global/deployment/global-0038.yaml
|
||||
interviews/vault/questions/global/deployment/global-0039.yaml
|
||||
interviews/vault/questions/global/deployment/global-0040.yaml
|
||||
interviews/vault/questions/global/deployment/global-0041.yaml
|
||||
interviews/vault/questions/global/deployment/global-0042.yaml
|
||||
interviews/vault/questions/global/deployment/global-0089.yaml
|
||||
interviews/vault/questions/global/deployment/global-0109.yaml
|
||||
interviews/vault/questions/global/deployment/global-0175.yaml
|
||||
interviews/vault/questions/global/deployment/global-0179.yaml
|
||||
interviews/vault/questions/global/deployment/global-0185.yaml
|
||||
interviews/vault/questions/global/deployment/global-0189.yaml
|
||||
interviews/vault/questions/global/deployment/global-0195.yaml
|
||||
interviews/vault/questions/global/deployment/global-0198.yaml
|
||||
interviews/vault/questions/global/deployment/global-0224.yaml
|
||||
interviews/vault/questions/global/deployment/global-0245.yaml
|
||||
interviews/vault/questions/global/deployment/global-0284.yaml
|
||||
interviews/vault/questions/global/deployment/global-0297.yaml
|
||||
interviews/vault/questions/global/deployment/global-0305.yaml
|
||||
interviews/vault/questions/global/deployment/global-0309.yaml
|
||||
interviews/vault/questions/global/deployment/global-0313.yaml
|
||||
interviews/vault/questions/global/deployment/global-0401.yaml
|
||||
interviews/vault/questions/global/deployment/global-0440.yaml
|
||||
interviews/vault/questions/global/latency/global-0057.yaml
|
||||
interviews/vault/questions/global/latency/global-0087.yaml
|
||||
interviews/vault/questions/global/latency/global-0097.yaml
|
||||
interviews/vault/questions/global/latency/global-0133.yaml
|
||||
interviews/vault/questions/global/latency/global-0139.yaml
|
||||
interviews/vault/questions/global/latency/global-0143.yaml
|
||||
interviews/vault/questions/global/latency/global-0178.yaml
|
||||
interviews/vault/questions/global/latency/global-0180.yaml
|
||||
interviews/vault/questions/global/latency/global-0184.yaml
|
||||
interviews/vault/questions/global/latency/global-0186.yaml
|
||||
interviews/vault/questions/global/latency/global-0199.yaml
|
||||
interviews/vault/questions/global/latency/global-0209.yaml
|
||||
interviews/vault/questions/global/latency/global-0225.yaml
|
||||
interviews/vault/questions/global/latency/global-0238.yaml
|
||||
interviews/vault/questions/global/latency/global-0243.yaml
|
||||
interviews/vault/questions/global/latency/global-0251.yaml
|
||||
interviews/vault/questions/global/latency/global-0256.yaml
|
||||
interviews/vault/questions/global/latency/global-0258.yaml
|
||||
interviews/vault/questions/global/latency/global-0259.yaml
|
||||
interviews/vault/questions/global/latency/global-0261.yaml
|
||||
interviews/vault/questions/global/latency/global-0265.yaml
|
||||
interviews/vault/questions/global/latency/global-0268.yaml
|
||||
interviews/vault/questions/global/latency/global-0272.yaml
|
||||
interviews/vault/questions/global/latency/global-0273.yaml
|
||||
interviews/vault/questions/global/latency/global-0276.yaml
|
||||
interviews/vault/questions/global/latency/global-0278.yaml
|
||||
interviews/vault/questions/global/latency/global-0279.yaml
|
||||
interviews/vault/questions/global/latency/global-0281.yaml
|
||||
interviews/vault/questions/global/latency/global-0291.yaml
|
||||
interviews/vault/questions/global/latency/global-0294.yaml
|
||||
interviews/vault/questions/global/latency/global-0300.yaml
|
||||
interviews/vault/questions/global/latency/global-0308.yaml
|
||||
interviews/vault/questions/global/latency/global-0432.yaml
|
||||
interviews/vault/questions/global/latency/global-0435.yaml
|
||||
interviews/vault/questions/global/memory/global-0000.yaml
|
||||
interviews/vault/questions/global/memory/global-0002.yaml
|
||||
interviews/vault/questions/global/memory/global-0007.yaml
|
||||
interviews/vault/questions/global/memory/global-0013.yaml
|
||||
interviews/vault/questions/global/memory/global-0019.yaml
|
||||
interviews/vault/questions/global/memory/global-0020.yaml
|
||||
interviews/vault/questions/global/memory/global-0021.yaml
|
||||
interviews/vault/questions/global/memory/global-0022.yaml
|
||||
interviews/vault/questions/global/memory/global-0023.yaml
|
||||
interviews/vault/questions/global/memory/global-0054.yaml
|
||||
interviews/vault/questions/global/memory/global-0058.yaml
|
||||
interviews/vault/questions/global/memory/global-0059.yaml
|
||||
interviews/vault/questions/global/memory/global-0060.yaml
|
||||
interviews/vault/questions/global/memory/global-0062.yaml
|
||||
interviews/vault/questions/global/memory/global-0098.yaml
|
||||
interviews/vault/questions/global/memory/global-0102.yaml
|
||||
interviews/vault/questions/global/memory/global-0104.yaml
|
||||
interviews/vault/questions/global/memory/global-0114.yaml
|
||||
interviews/vault/questions/global/memory/global-0118.yaml
|
||||
interviews/vault/questions/global/memory/global-0128.yaml
|
||||
interviews/vault/questions/global/memory/global-0130.yaml
|
||||
interviews/vault/questions/global/memory/global-0132.yaml
|
||||
interviews/vault/questions/global/memory/global-0135.yaml
|
||||
interviews/vault/questions/global/memory/global-0141.yaml
|
||||
interviews/vault/questions/global/memory/global-0144.yaml
|
||||
interviews/vault/questions/global/memory/global-0148.yaml
|
||||
interviews/vault/questions/global/memory/global-0151.yaml
|
||||
interviews/vault/questions/global/memory/global-0156.yaml
|
||||
interviews/vault/questions/global/memory/global-0157.yaml
|
||||
interviews/vault/questions/global/memory/global-0162.yaml
|
||||
interviews/vault/questions/global/memory/global-0201.yaml
|
||||
interviews/vault/questions/global/memory/global-0208.yaml
|
||||
interviews/vault/questions/global/memory/global-0217.yaml
|
||||
interviews/vault/questions/global/memory/global-0227.yaml
|
||||
interviews/vault/questions/global/memory/global-0234.yaml
|
||||
interviews/vault/questions/global/memory/global-0235.yaml
|
||||
interviews/vault/questions/global/memory/global-0241.yaml
|
||||
interviews/vault/questions/global/memory/global-0250.yaml
|
||||
interviews/vault/questions/global/memory/global-0252.yaml
|
||||
interviews/vault/questions/global/memory/global-0260.yaml
|
||||
interviews/vault/questions/global/memory/global-0263.yaml
|
||||
interviews/vault/questions/global/memory/global-0270.yaml
|
||||
interviews/vault/questions/global/memory/global-0285.yaml
|
||||
interviews/vault/questions/global/memory/global-0296.yaml
|
||||
interviews/vault/questions/global/memory/global-0434.yaml
|
||||
interviews/vault/questions/global/networking/global-0009.yaml
|
||||
interviews/vault/questions/global/networking/global-0017.yaml
|
||||
interviews/vault/questions/global/networking/global-0030.yaml
|
||||
interviews/vault/questions/global/networking/global-0065.yaml
|
||||
interviews/vault/questions/global/networking/global-0066.yaml
|
||||
interviews/vault/questions/global/networking/global-0074.yaml
|
||||
interviews/vault/questions/global/networking/global-0075.yaml
|
||||
interviews/vault/questions/global/networking/global-0076.yaml
|
||||
interviews/vault/questions/global/networking/global-0077.yaml
|
||||
interviews/vault/questions/global/networking/global-0078.yaml
|
||||
interviews/vault/questions/global/networking/global-0079.yaml
|
||||
interviews/vault/questions/global/networking/global-0080.yaml
|
||||
interviews/vault/questions/global/networking/global-0081.yaml
|
||||
interviews/vault/questions/global/networking/global-0082.yaml
|
||||
interviews/vault/questions/global/networking/global-0083.yaml
|
||||
interviews/vault/questions/global/networking/global-0084.yaml
|
||||
interviews/vault/questions/global/networking/global-0085.yaml
|
||||
interviews/vault/questions/global/networking/global-0093.yaml
|
||||
interviews/vault/questions/global/networking/global-0107.yaml
|
||||
interviews/vault/questions/global/networking/global-0136.yaml
|
||||
interviews/vault/questions/global/networking/global-0147.yaml
|
||||
interviews/vault/questions/global/networking/global-0161.yaml
|
||||
interviews/vault/questions/global/networking/global-0204.yaml
|
||||
interviews/vault/questions/global/networking/global-0205.yaml
|
||||
interviews/vault/questions/global/networking/global-0218.yaml
|
||||
interviews/vault/questions/global/networking/global-0231.yaml
|
||||
interviews/vault/questions/global/networking/global-0244.yaml
|
||||
interviews/vault/questions/global/networking/global-0254.yaml
|
||||
interviews/vault/questions/global/networking/global-0264.yaml
|
||||
interviews/vault/questions/global/networking/global-0286.yaml
|
||||
interviews/vault/questions/global/networking/global-0335.yaml
|
||||
interviews/vault/questions/global/networking/global-0437.yaml
|
||||
interviews/vault/questions/global/optimization/global-0015.yaml
|
||||
interviews/vault/questions/global/optimization/global-0049.yaml
|
||||
interviews/vault/questions/global/optimization/global-0050.yaml
|
||||
interviews/vault/questions/global/optimization/global-0051.yaml
|
||||
interviews/vault/questions/global/optimization/global-0055.yaml
|
||||
interviews/vault/questions/global/optimization/global-0061.yaml
|
||||
interviews/vault/questions/global/optimization/global-0063.yaml
|
||||
interviews/vault/questions/global/optimization/global-0064.yaml
|
||||
interviews/vault/questions/global/optimization/global-0094.yaml
|
||||
interviews/vault/questions/global/optimization/global-0101.yaml
|
||||
interviews/vault/questions/global/optimization/global-0106.yaml
|
||||
interviews/vault/questions/global/optimization/global-0115.yaml
|
||||
interviews/vault/questions/global/optimization/global-0119.yaml
|
||||
interviews/vault/questions/global/optimization/global-0124.yaml
|
||||
interviews/vault/questions/global/optimization/global-0125.yaml
|
||||
interviews/vault/questions/global/optimization/global-0134.yaml
|
||||
interviews/vault/questions/global/optimization/global-0142.yaml
|
||||
interviews/vault/questions/global/optimization/global-0145.yaml
|
||||
interviews/vault/questions/global/optimization/global-0152.yaml
|
||||
interviews/vault/questions/global/optimization/global-0210.yaml
|
||||
interviews/vault/questions/global/optimization/global-0223.yaml
|
||||
interviews/vault/questions/global/optimization/global-0237.yaml
|
||||
interviews/vault/questions/global/optimization/global-0240.yaml
|
||||
interviews/vault/questions/global/optimization/global-0248.yaml
|
||||
interviews/vault/questions/global/optimization/global-0302.yaml
|
||||
interviews/vault/questions/global/optimization/global-0310.yaml
|
||||
interviews/vault/questions/global/optimization/global-0315.yaml
|
||||
interviews/vault/questions/global/optimization/global-0438.yaml
|
||||
interviews/vault/questions/global/parallelism/global-0018.yaml
|
||||
interviews/vault/questions/global/parallelism/global-0032.yaml
|
||||
interviews/vault/questions/global/parallelism/global-0215.yaml
|
||||
interviews/vault/questions/global/parallelism/global-0229.yaml
|
||||
interviews/vault/questions/global/parallelism/global-0242.yaml
|
||||
interviews/vault/questions/global/parallelism/global-0255.yaml
|
||||
interviews/vault/questions/global/parallelism/global-0262.yaml
|
||||
interviews/vault/questions/global/parallelism/global-0266.yaml
|
||||
interviews/vault/questions/global/parallelism/global-0271.yaml
|
||||
interviews/vault/questions/global/parallelism/global-0280.yaml
|
||||
interviews/vault/questions/global/parallelism/global-0282.yaml
|
||||
interviews/vault/questions/global/parallelism/global-0283.yaml
|
||||
interviews/vault/questions/global/parallelism/global-0292.yaml
|
||||
interviews/vault/questions/global/parallelism/global-0295.yaml
|
||||
interviews/vault/questions/global/parallelism/global-0299.yaml
|
||||
interviews/vault/questions/global/parallelism/global-0303.yaml
|
||||
interviews/vault/questions/global/parallelism/global-0311.yaml
|
||||
interviews/vault/questions/global/parallelism/global-0318.yaml
|
||||
interviews/vault/questions/global/parallelism/global-0374.yaml
|
||||
interviews/vault/questions/global/parallelism/global-0397.yaml
|
||||
interviews/vault/questions/global/power/global-0001.yaml
|
||||
interviews/vault/questions/global/power/global-0006.yaml
|
||||
interviews/vault/questions/global/power/global-0011.yaml
|
||||
interviews/vault/questions/global/power/global-0056.yaml
|
||||
interviews/vault/questions/global/power/global-0129.yaml
|
||||
interviews/vault/questions/global/power/global-0149.yaml
|
||||
interviews/vault/questions/global/power/global-0155.yaml
|
||||
interviews/vault/questions/global/power/global-0163.yaml
|
||||
interviews/vault/questions/global/power/global-0164.yaml
|
||||
interviews/vault/questions/global/power/global-0165.yaml
|
||||
interviews/vault/questions/global/power/global-0166.yaml
|
||||
interviews/vault/questions/global/power/global-0167.yaml
|
||||
interviews/vault/questions/global/power/global-0168.yaml
|
||||
interviews/vault/questions/global/power/global-0169.yaml
|
||||
interviews/vault/questions/global/power/global-0174.yaml
|
||||
interviews/vault/questions/global/power/global-0176.yaml
|
||||
interviews/vault/questions/global/power/global-0181.yaml
|
||||
interviews/vault/questions/global/power/global-0183.yaml
|
||||
interviews/vault/questions/global/power/global-0188.yaml
|
||||
interviews/vault/questions/global/power/global-0191.yaml
|
||||
interviews/vault/questions/global/power/global-0194.yaml
|
||||
interviews/vault/questions/global/power/global-0197.yaml
|
||||
interviews/vault/questions/global/power/global-0203.yaml
|
||||
interviews/vault/questions/global/power/global-0211.yaml
|
||||
interviews/vault/questions/global/power/global-0220.yaml
|
||||
interviews/vault/questions/global/power/global-0233.yaml
|
||||
interviews/vault/questions/global/power/global-0239.yaml
|
||||
interviews/vault/questions/global/power/global-0249.yaml
|
||||
interviews/vault/questions/global/power/global-0253.yaml
|
||||
interviews/vault/questions/global/power/global-0269.yaml
|
||||
interviews/vault/questions/global/power/global-0289.yaml
|
||||
interviews/vault/questions/global/power/global-0298.yaml
|
||||
interviews/vault/questions/global/precision/global-0004.yaml
|
||||
interviews/vault/questions/global/precision/global-0095.yaml
|
||||
interviews/vault/questions/global/precision/global-0096.yaml
|
||||
interviews/vault/questions/global/precision/global-0099.yaml
|
||||
interviews/vault/questions/global/precision/global-0103.yaml
|
||||
interviews/vault/questions/global/precision/global-0111.yaml
|
||||
interviews/vault/questions/global/precision/global-0131.yaml
|
||||
interviews/vault/questions/global/precision/global-0193.yaml
|
||||
interviews/vault/questions/global/precision/global-0202.yaml
|
||||
interviews/vault/questions/global/precision/global-0213.yaml
|
||||
interviews/vault/questions/global/precision/global-0228.yaml
|
||||
interviews/vault/questions/global/precision/global-0246.yaml
|
||||
interviews/vault/questions/global/precision/global-0358.yaml
|
||||
interviews/vault/questions/global/reliability/global-0069.yaml
|
||||
interviews/vault/questions/global/reliability/global-0070.yaml
|
||||
interviews/vault/questions/global/reliability/global-0071.yaml
|
||||
interviews/vault/questions/global/reliability/global-0072.yaml
|
||||
interviews/vault/questions/global/reliability/global-0073.yaml
|
||||
interviews/vault/questions/global/reliability/global-0088.yaml
|
||||
interviews/vault/questions/global/reliability/global-0153.yaml
|
||||
interviews/vault/questions/global/reliability/global-0160.yaml
|
||||
interviews/vault/questions/global/reliability/global-0216.yaml
|
||||
interviews/vault/questions/global/reliability/global-0230.yaml
|
||||
interviews/vault/questions/global/reliability/global-0236.yaml
|
||||
interviews/vault/questions/global/reliability/global-0247.yaml
|
||||
interviews/vault/questions/global/reliability/global-0362.yaml
|
||||
interviews/vault/questions/global/reliability/global-0421.yaml
|
||||
interviews/vault/questions/global/reliability/global-0436.yaml
|
||||
@@ -0,0 +1,107 @@
|
||||
You are auditing the StaffML ML-systems interview corpus. Each item is a YAML
|
||||
file under `interviews/vault/questions/<track>/<area>/<id>.yaml`. Audit only
|
||||
files where `status: published`.
|
||||
|
||||
OUTPUT TARGET (write here, append, one JSON object per line):
|
||||
`audit_results.jsonl`
|
||||
Create the directory if it doesn't exist. If the file already exists, read it
|
||||
first, collect the set of qids already audited, and SKIP those — this lets
|
||||
the run resume after an interruption.
|
||||
|
||||
WORK PLAN
|
||||
1. Read the list of published YAML files from `cloud_published.txt`. Track them
|
||||
in lexical order (sorted by track, then area, then qid).
|
||||
2. For each unaudited published file:
|
||||
a. Read the YAML. Extract: id, track, level, zone, topic, competency_area,
|
||||
title, scenario, question (if present), and the entire `details` block
|
||||
(realistic_solution, common_mistake, napkin_math, options, correct_index).
|
||||
b. Run the five gates below.
|
||||
c. Append a single JSON record to the output file (with a trailing newline).
|
||||
3. Every 25 questions, print a one-line progress update to stdout:
|
||||
`progress: <N>/<TOTAL> · pass=<P> fail=<F> · current=<qid>`.
|
||||
4. When done, print a summary block: per-gate pass/fail counts, per-track
|
||||
totals, top 10 failure rationales by frequency.
|
||||
|
||||
THE FIVE GATES
|
||||
|
||||
Gate A — format_compliance
|
||||
common_mistake (when non-empty) must contain in order:
|
||||
"**The Pitfall:**" "**The Rationale:**" "**The Consequence:**"
|
||||
napkin_math (when non-empty) must contain in order:
|
||||
"**Assumptions" (or "**Assumptions & Constraints:**")
|
||||
"**Calculations:**"
|
||||
"**Conclusion" (or "**Conclusion & Interpretation:**")
|
||||
Verdict: pass | fail · with `format_issues: [<missing markers>]` on fail.
|
||||
|
||||
Gate B — level_fit
|
||||
The `level` field claims a Bloom-mapped depth (L1=Remember .. L6+=Create
|
||||
Staff-level). Read the question + scenario + realistic_solution and judge
|
||||
whether the claimed level matches what the question actually demands.
|
||||
Verdict: pass | fail
|
||||
On fail: `level_fit_rationale` (1-2 sentences), `suggested_level` (e.g. "L3").
|
||||
|
||||
Gate C — coherence
|
||||
Reject (verdict=fail) on any of:
|
||||
1. PHYSICAL ABSURDITY: hardware/software numbers violate real-world
|
||||
bounds (e.g., NPU wake-up >50ms, smartphone pulling 50W, latency
|
||||
>5× off realistic for the named hardware).
|
||||
2. VENDOR-NAME FABRICATION: hardware/framework/benchmark names that
|
||||
don't exist or are misattributed (e.g., "Coral Edge TPU XL" — no XL
|
||||
variant). Treat ambiguous-but-plausible as ok; flag clearly invented.
|
||||
3. SCENARIO/QUESTION/SOLUTION MISMATCH: question doesn't follow from
|
||||
scenario, realistic_solution doesn't actually answer the question,
|
||||
or numbers contradict across fields.
|
||||
4. ARITHMETIC IN SCENARIO: scenario contains a stated calculation that
|
||||
is wrong on its face (this is separate from gate D's napkin math).
|
||||
Verdict: pass | fail · `coherence_failure_mode` (one of: physical-absurdity,
|
||||
vendor-fabrication, mismatch, scenario-arithmetic, none) · `coherence_rationale`.
|
||||
|
||||
Gate D — math_correct
|
||||
Independently re-derive the napkin_math calculations. Are the assumptions
|
||||
sound? Do the unit conversions check out? Does the conclusion follow?
|
||||
Verdict: pass | fail · `math_errors: [<short error list>]` on fail.
|
||||
|
||||
Gate E — title_quality
|
||||
Title (≤120 chars, plaintext, no LaTeX, no markdown, no underscores).
|
||||
Verdicts:
|
||||
good — descriptive, concrete, names the operative concept
|
||||
generic — too vague to retrieve ("Cloud Q1", "Memory Question")
|
||||
placeholder — clearly an unfilled placeholder ("TODO", "draft", "x")
|
||||
On non-good: `title_suggestion` if you can produce a short concrete one.
|
||||
|
||||
OUTPUT JSON SHAPE (one per line in `audit_results.jsonl`)
|
||||
|
||||
{
|
||||
"qid": "cloud-2297",
|
||||
"track": "cloud",
|
||||
"format_compliance": "pass" | "fail",
|
||||
"format_issues": [],
|
||||
"level_fit": "pass" | "fail",
|
||||
"level_fit_rationale": "...",
|
||||
"suggested_level": "L4" | null,
|
||||
"coherence": "pass" | "fail",
|
||||
"coherence_failure_mode": "none" | "physical-absurdity" | ...,
|
||||
"coherence_rationale": "...",
|
||||
"math_correct": "pass" | "fail",
|
||||
"math_errors": [],
|
||||
"title_quality": "good" | "generic" | "placeholder",
|
||||
"title_suggestion": null
|
||||
}
|
||||
|
||||
CRITICAL RULES
|
||||
|
||||
- Append only. Do not rewrite the file. Each batch you complete should be
|
||||
durable on disk so a kill-9 mid-run loses at most one item.
|
||||
- Do not modify any YAML. This is read-only audit; corrections are a
|
||||
downstream task.
|
||||
- Skip non-published statuses. Do not audit drafts, flagged, deleted,
|
||||
or archived.
|
||||
- Process at least 200 items per session. Print progress every 25.
|
||||
- If you encounter a YAML you can't parse, write a record with
|
||||
`qid: "<filename-stem>"` and all gates `error`, plus `_reason: "..."`.
|
||||
- If you hit a tool / network error, write what you have so far, then
|
||||
print `STOPPING: <reason>` and exit cleanly. Do not crash.
|
||||
|
||||
START NOW. First action: read the existing
|
||||
`audit_results.jsonl` (or note
|
||||
that it doesn't exist), then list published YAMLs.
|
||||
@@ -0,0 +1,107 @@
|
||||
You are auditing the StaffML ML-systems interview corpus. Each item is a YAML
|
||||
file under `interviews/vault/questions/<track>/<area>/<id>.yaml`. Audit only
|
||||
files where `status: published`.
|
||||
|
||||
OUTPUT TARGET (write here, append, one JSON object per line):
|
||||
`audit_results.jsonl`
|
||||
Create the directory if it doesn't exist. If the file already exists, read it
|
||||
first, collect the set of qids already audited, and SKIP those — this lets
|
||||
the run resume after an interruption.
|
||||
|
||||
WORK PLAN
|
||||
1. Read the list of published YAML files from `edge_published.txt`. Track them
|
||||
in lexical order (sorted by track, then area, then qid).
|
||||
2. For each unaudited published file:
|
||||
a. Read the YAML. Extract: id, track, level, zone, topic, competency_area,
|
||||
title, scenario, question (if present), and the entire `details` block
|
||||
(realistic_solution, common_mistake, napkin_math, options, correct_index).
|
||||
b. Run the five gates below.
|
||||
c. Append a single JSON record to the output file (with a trailing newline).
|
||||
3. Every 25 questions, print a one-line progress update to stdout:
|
||||
`progress: <N>/<TOTAL> · pass=<P> fail=<F> · current=<qid>`.
|
||||
4. When done, print a summary block: per-gate pass/fail counts, per-track
|
||||
totals, top 10 failure rationales by frequency.
|
||||
|
||||
THE FIVE GATES
|
||||
|
||||
Gate A — format_compliance
|
||||
common_mistake (when non-empty) must contain in order:
|
||||
"**The Pitfall:**" "**The Rationale:**" "**The Consequence:**"
|
||||
napkin_math (when non-empty) must contain in order:
|
||||
"**Assumptions" (or "**Assumptions & Constraints:**")
|
||||
"**Calculations:**"
|
||||
"**Conclusion" (or "**Conclusion & Interpretation:**")
|
||||
Verdict: pass | fail · with `format_issues: [<missing markers>]` on fail.
|
||||
|
||||
Gate B — level_fit
|
||||
The `level` field claims a Bloom-mapped depth (L1=Remember .. L6+=Create
|
||||
Staff-level). Read the question + scenario + realistic_solution and judge
|
||||
whether the claimed level matches what the question actually demands.
|
||||
Verdict: pass | fail
|
||||
On fail: `level_fit_rationale` (1-2 sentences), `suggested_level` (e.g. "L3").
|
||||
|
||||
Gate C — coherence
|
||||
Reject (verdict=fail) on any of:
|
||||
1. PHYSICAL ABSURDITY: hardware/software numbers violate real-world
|
||||
bounds (e.g., NPU wake-up >50ms, smartphone pulling 50W, latency
|
||||
>5× off realistic for the named hardware).
|
||||
2. VENDOR-NAME FABRICATION: hardware/framework/benchmark names that
|
||||
don't exist or are misattributed (e.g., "Coral Edge TPU XL" — no XL
|
||||
variant). Treat ambiguous-but-plausible as ok; flag clearly invented.
|
||||
3. SCENARIO/QUESTION/SOLUTION MISMATCH: question doesn't follow from
|
||||
scenario, realistic_solution doesn't actually answer the question,
|
||||
or numbers contradict across fields.
|
||||
4. ARITHMETIC IN SCENARIO: scenario contains a stated calculation that
|
||||
is wrong on its face (this is separate from gate D's napkin math).
|
||||
Verdict: pass | fail · `coherence_failure_mode` (one of: physical-absurdity,
|
||||
vendor-fabrication, mismatch, scenario-arithmetic, none) · `coherence_rationale`.
|
||||
|
||||
Gate D — math_correct
|
||||
Independently re-derive the napkin_math calculations. Are the assumptions
|
||||
sound? Do the unit conversions check out? Does the conclusion follow?
|
||||
Verdict: pass | fail · `math_errors: [<short error list>]` on fail.
|
||||
|
||||
Gate E — title_quality
|
||||
Title (≤120 chars, plaintext, no LaTeX, no markdown, no underscores).
|
||||
Verdicts:
|
||||
good — descriptive, concrete, names the operative concept
|
||||
generic — too vague to retrieve ("Cloud Q1", "Memory Question")
|
||||
placeholder — clearly an unfilled placeholder ("TODO", "draft", "x")
|
||||
On non-good: `title_suggestion` if you can produce a short concrete one.
|
||||
|
||||
OUTPUT JSON SHAPE (one per line in `audit_results.jsonl`)
|
||||
|
||||
{
|
||||
"qid": "cloud-2297",
|
||||
"track": "cloud",
|
||||
"format_compliance": "pass" | "fail",
|
||||
"format_issues": [],
|
||||
"level_fit": "pass" | "fail",
|
||||
"level_fit_rationale": "...",
|
||||
"suggested_level": "L4" | null,
|
||||
"coherence": "pass" | "fail",
|
||||
"coherence_failure_mode": "none" | "physical-absurdity" | ...,
|
||||
"coherence_rationale": "...",
|
||||
"math_correct": "pass" | "fail",
|
||||
"math_errors": [],
|
||||
"title_quality": "good" | "generic" | "placeholder",
|
||||
"title_suggestion": null
|
||||
}
|
||||
|
||||
CRITICAL RULES
|
||||
|
||||
- Append only. Do not rewrite the file. Each batch you complete should be
|
||||
durable on disk so a kill-9 mid-run loses at most one item.
|
||||
- Do not modify any YAML. This is read-only audit; corrections are a
|
||||
downstream task.
|
||||
- Skip non-published statuses. Do not audit drafts, flagged, deleted,
|
||||
or archived.
|
||||
- Process at least 200 items per session. Print progress every 25.
|
||||
- If you encounter a YAML you can't parse, write a record with
|
||||
`qid: "<filename-stem>"` and all gates `error`, plus `_reason: "..."`.
|
||||
- If you hit a tool / network error, write what you have so far, then
|
||||
print `STOPPING: <reason>` and exit cleanly. Do not crash.
|
||||
|
||||
START NOW. First action: read the existing
|
||||
`audit_results.jsonl` (or note
|
||||
that it doesn't exist), then list published YAMLs.
|
||||
@@ -0,0 +1,107 @@
|
||||
You are auditing the StaffML ML-systems interview corpus. Each item is a YAML
|
||||
file under `interviews/vault/questions/<track>/<area>/<id>.yaml`. Audit only
|
||||
files where `status: published`.
|
||||
|
||||
OUTPUT TARGET (write here, append, one JSON object per line):
|
||||
`global_test.jsonl`
|
||||
Create the directory if it doesn't exist. If the file already exists, read it
|
||||
first, collect the set of qids already audited, and SKIP those — this lets
|
||||
the run resume after an interruption.
|
||||
|
||||
WORK PLAN
|
||||
1. Read the list of published YAML files from `global_published.txt`. Track them
|
||||
in lexical order (sorted by track, then area, then qid).
|
||||
2. For each unaudited published file:
|
||||
a. Read the YAML. Extract: id, track, level, zone, topic, competency_area,
|
||||
title, scenario, question (if present), and the entire `details` block
|
||||
(realistic_solution, common_mistake, napkin_math, options, correct_index).
|
||||
b. Run the five gates below.
|
||||
c. Append a single JSON record to the output file (with a trailing newline).
|
||||
3. Every 25 questions, print a one-line progress update to stdout:
|
||||
`progress: <N>/<TOTAL> · pass=<P> fail=<F> · current=<qid>`.
|
||||
4. When done, print a summary block: per-gate pass/fail counts, per-track
|
||||
totals, top 10 failure rationales by frequency.
|
||||
|
||||
THE FIVE GATES
|
||||
|
||||
Gate A — format_compliance
|
||||
common_mistake (when non-empty) must contain in order:
|
||||
"**The Pitfall:**" "**The Rationale:**" "**The Consequence:**"
|
||||
napkin_math (when non-empty) must contain in order:
|
||||
"**Assumptions" (or "**Assumptions & Constraints:**")
|
||||
"**Calculations:**"
|
||||
"**Conclusion" (or "**Conclusion & Interpretation:**")
|
||||
Verdict: pass | fail · with `format_issues: [<missing markers>]` on fail.
|
||||
|
||||
Gate B — level_fit
|
||||
The `level` field claims a Bloom-mapped depth (L1=Remember .. L6+=Create
|
||||
Staff-level). Read the question + scenario + realistic_solution and judge
|
||||
whether the claimed level matches what the question actually demands.
|
||||
Verdict: pass | fail
|
||||
On fail: `level_fit_rationale` (1-2 sentences), `suggested_level` (e.g. "L3").
|
||||
|
||||
Gate C — coherence
|
||||
Reject (verdict=fail) on any of:
|
||||
1. PHYSICAL ABSURDITY: hardware/software numbers violate real-world
|
||||
bounds (e.g., NPU wake-up >50ms, smartphone pulling 50W, latency
|
||||
>5× off realistic for the named hardware).
|
||||
2. VENDOR-NAME FABRICATION: hardware/framework/benchmark names that
|
||||
don't exist or are misattributed (e.g., "Coral Edge TPU XL" — no XL
|
||||
variant). Treat ambiguous-but-plausible as ok; flag clearly invented.
|
||||
3. SCENARIO/QUESTION/SOLUTION MISMATCH: question doesn't follow from
|
||||
scenario, realistic_solution doesn't actually answer the question,
|
||||
or numbers contradict across fields.
|
||||
4. ARITHMETIC IN SCENARIO: scenario contains a stated calculation that
|
||||
is wrong on its face (this is separate from gate D's napkin math).
|
||||
Verdict: pass | fail · `coherence_failure_mode` (one of: physical-absurdity,
|
||||
vendor-fabrication, mismatch, scenario-arithmetic, none) · `coherence_rationale`.
|
||||
|
||||
Gate D — math_correct
|
||||
Independently re-derive the napkin_math calculations. Are the assumptions
|
||||
sound? Do the unit conversions check out? Does the conclusion follow?
|
||||
Verdict: pass | fail · `math_errors: [<short error list>]` on fail.
|
||||
|
||||
Gate E — title_quality
|
||||
Title (≤120 chars, plaintext, no LaTeX, no markdown, no underscores).
|
||||
Verdicts:
|
||||
good — descriptive, concrete, names the operative concept
|
||||
generic — too vague to retrieve ("Cloud Q1", "Memory Question")
|
||||
placeholder — clearly an unfilled placeholder ("TODO", "draft", "x")
|
||||
On non-good: `title_suggestion` if you can produce a short concrete one.
|
||||
|
||||
OUTPUT JSON SHAPE (one per line in `global_test.jsonl`)
|
||||
|
||||
{
|
||||
"qid": "cloud-2297",
|
||||
"track": "cloud",
|
||||
"format_compliance": "pass" | "fail",
|
||||
"format_issues": [],
|
||||
"level_fit": "pass" | "fail",
|
||||
"level_fit_rationale": "...",
|
||||
"suggested_level": "L4" | null,
|
||||
"coherence": "pass" | "fail",
|
||||
"coherence_failure_mode": "none" | "physical-absurdity" | ...,
|
||||
"coherence_rationale": "...",
|
||||
"math_correct": "pass" | "fail",
|
||||
"math_errors": [],
|
||||
"title_quality": "good" | "generic" | "placeholder",
|
||||
"title_suggestion": null
|
||||
}
|
||||
|
||||
CRITICAL RULES
|
||||
|
||||
- Append only. Do not rewrite the file. Each batch you complete should be
|
||||
durable on disk so a kill-9 mid-run loses at most one item.
|
||||
- Do not modify any YAML. This is read-only audit; corrections are a
|
||||
downstream task.
|
||||
- Skip non-published statuses. Do not audit drafts, flagged, deleted,
|
||||
or archived.
|
||||
- Process at least 200 items per session. Print progress every 25.
|
||||
- If you encounter a YAML you can't parse, write a record with
|
||||
`qid: "<filename-stem>"` and all gates `error`, plus `_reason: "..."`.
|
||||
- If you hit a tool / network error, write what you have so far, then
|
||||
print `STOPPING: <reason>` and exit cleanly. Do not crash.
|
||||
|
||||
START NOW. First action: read the existing
|
||||
`global_test.jsonl` (or note
|
||||
that it doesn't exist), then list published YAMLs.
|
||||
@@ -0,0 +1,107 @@
|
||||
You are auditing the StaffML ML-systems interview corpus. Each item is a YAML
|
||||
file under `interviews/vault/questions/<track>/<area>/<id>.yaml`. Audit only
|
||||
files where `status: published`.
|
||||
|
||||
OUTPUT TARGET (write here, append, one JSON object per line):
|
||||
`audit_results.jsonl`
|
||||
Create the directory if it doesn't exist. If the file already exists, read it
|
||||
first, collect the set of qids already audited, and SKIP those — this lets
|
||||
the run resume after an interruption.
|
||||
|
||||
WORK PLAN
|
||||
1. Read the list of published YAML files from `mobile_published.txt`. Track them
|
||||
in lexical order (sorted by track, then area, then qid).
|
||||
2. For each unaudited published file:
|
||||
a. Read the YAML. Extract: id, track, level, zone, topic, competency_area,
|
||||
title, scenario, question (if present), and the entire `details` block
|
||||
(realistic_solution, common_mistake, napkin_math, options, correct_index).
|
||||
b. Run the five gates below.
|
||||
c. Append a single JSON record to the output file (with a trailing newline).
|
||||
3. Every 25 questions, print a one-line progress update to stdout:
|
||||
`progress: <N>/<TOTAL> · pass=<P> fail=<F> · current=<qid>`.
|
||||
4. When done, print a summary block: per-gate pass/fail counts, per-track
|
||||
totals, top 10 failure rationales by frequency.
|
||||
|
||||
THE FIVE GATES
|
||||
|
||||
Gate A — format_compliance
|
||||
common_mistake (when non-empty) must contain in order:
|
||||
"**The Pitfall:**" "**The Rationale:**" "**The Consequence:**"
|
||||
napkin_math (when non-empty) must contain in order:
|
||||
"**Assumptions" (or "**Assumptions & Constraints:**")
|
||||
"**Calculations:**"
|
||||
"**Conclusion" (or "**Conclusion & Interpretation:**")
|
||||
Verdict: pass | fail · with `format_issues: [<missing markers>]` on fail.
|
||||
|
||||
Gate B — level_fit
|
||||
The `level` field claims a Bloom-mapped depth (L1=Remember .. L6+=Create
|
||||
Staff-level). Read the question + scenario + realistic_solution and judge
|
||||
whether the claimed level matches what the question actually demands.
|
||||
Verdict: pass | fail
|
||||
On fail: `level_fit_rationale` (1-2 sentences), `suggested_level` (e.g. "L3").
|
||||
|
||||
Gate C — coherence
|
||||
Reject (verdict=fail) on any of:
|
||||
1. PHYSICAL ABSURDITY: hardware/software numbers violate real-world
|
||||
bounds (e.g., NPU wake-up >50ms, smartphone pulling 50W, latency
|
||||
>5× off realistic for the named hardware).
|
||||
2. VENDOR-NAME FABRICATION: hardware/framework/benchmark names that
|
||||
don't exist or are misattributed (e.g., "Coral Edge TPU XL" — no XL
|
||||
variant). Treat ambiguous-but-plausible as ok; flag clearly invented.
|
||||
3. SCENARIO/QUESTION/SOLUTION MISMATCH: question doesn't follow from
|
||||
scenario, realistic_solution doesn't actually answer the question,
|
||||
or numbers contradict across fields.
|
||||
4. ARITHMETIC IN SCENARIO: scenario contains a stated calculation that
|
||||
is wrong on its face (this is separate from gate D's napkin math).
|
||||
Verdict: pass | fail · `coherence_failure_mode` (one of: physical-absurdity,
|
||||
vendor-fabrication, mismatch, scenario-arithmetic, none) · `coherence_rationale`.
|
||||
|
||||
Gate D — math_correct
|
||||
Independently re-derive the napkin_math calculations. Are the assumptions
|
||||
sound? Do the unit conversions check out? Does the conclusion follow?
|
||||
Verdict: pass | fail · `math_errors: [<short error list>]` on fail.
|
||||
|
||||
Gate E — title_quality
|
||||
Title (≤120 chars, plaintext, no LaTeX, no markdown, no underscores).
|
||||
Verdicts:
|
||||
good — descriptive, concrete, names the operative concept
|
||||
generic — too vague to retrieve ("Cloud Q1", "Memory Question")
|
||||
placeholder — clearly an unfilled placeholder ("TODO", "draft", "x")
|
||||
On non-good: `title_suggestion` if you can produce a short concrete one.
|
||||
|
||||
OUTPUT JSON SHAPE (one per line in `audit_results.jsonl`)
|
||||
|
||||
{
|
||||
"qid": "cloud-2297",
|
||||
"track": "cloud",
|
||||
"format_compliance": "pass" | "fail",
|
||||
"format_issues": [],
|
||||
"level_fit": "pass" | "fail",
|
||||
"level_fit_rationale": "...",
|
||||
"suggested_level": "L4" | null,
|
||||
"coherence": "pass" | "fail",
|
||||
"coherence_failure_mode": "none" | "physical-absurdity" | ...,
|
||||
"coherence_rationale": "...",
|
||||
"math_correct": "pass" | "fail",
|
||||
"math_errors": [],
|
||||
"title_quality": "good" | "generic" | "placeholder",
|
||||
"title_suggestion": null
|
||||
}
|
||||
|
||||
CRITICAL RULES
|
||||
|
||||
- Append only. Do not rewrite the file. Each batch you complete should be
|
||||
durable on disk so a kill-9 mid-run loses at most one item.
|
||||
- Do not modify any YAML. This is read-only audit; corrections are a
|
||||
downstream task.
|
||||
- Skip non-published statuses. Do not audit drafts, flagged, deleted,
|
||||
or archived.
|
||||
- Process at least 200 items per session. Print progress every 25.
|
||||
- If you encounter a YAML you can't parse, write a record with
|
||||
`qid: "<filename-stem>"` and all gates `error`, plus `_reason: "..."`.
|
||||
- If you hit a tool / network error, write what you have so far, then
|
||||
print `STOPPING: <reason>` and exit cleanly. Do not crash.
|
||||
|
||||
START NOW. First action: read the existing
|
||||
`audit_results.jsonl` (or note
|
||||
that it doesn't exist), then list published YAMLs.
|
||||
@@ -0,0 +1,107 @@
|
||||
You are auditing the StaffML ML-systems interview corpus. Each item is a YAML
|
||||
file under `interviews/vault/questions/<track>/<area>/<id>.yaml`. Audit only
|
||||
files where `status: published`.
|
||||
|
||||
OUTPUT TARGET (write here, append, one JSON object per line):
|
||||
`audit_results.jsonl`
|
||||
Create the directory if it doesn't exist. If the file already exists, read it
|
||||
first, collect the set of qids already audited, and SKIP those — this lets
|
||||
the run resume after an interruption.
|
||||
|
||||
WORK PLAN
|
||||
1. Read the list of published YAML files from `tinyml_published.txt`. Track them
|
||||
in lexical order (sorted by track, then area, then qid).
|
||||
2. For each unaudited published file:
|
||||
a. Read the YAML. Extract: id, track, level, zone, topic, competency_area,
|
||||
title, scenario, question (if present), and the entire `details` block
|
||||
(realistic_solution, common_mistake, napkin_math, options, correct_index).
|
||||
b. Run the five gates below.
|
||||
c. Append a single JSON record to the output file (with a trailing newline).
|
||||
3. Every 25 questions, print a one-line progress update to stdout:
|
||||
`progress: <N>/<TOTAL> · pass=<P> fail=<F> · current=<qid>`.
|
||||
4. When done, print a summary block: per-gate pass/fail counts, per-track
|
||||
totals, top 10 failure rationales by frequency.
|
||||
|
||||
THE FIVE GATES
|
||||
|
||||
Gate A — format_compliance
|
||||
common_mistake (when non-empty) must contain in order:
|
||||
"**The Pitfall:**" "**The Rationale:**" "**The Consequence:**"
|
||||
napkin_math (when non-empty) must contain in order:
|
||||
"**Assumptions" (or "**Assumptions & Constraints:**")
|
||||
"**Calculations:**"
|
||||
"**Conclusion" (or "**Conclusion & Interpretation:**")
|
||||
Verdict: pass | fail · with `format_issues: [<missing markers>]` on fail.
|
||||
|
||||
Gate B — level_fit
|
||||
The `level` field claims a Bloom-mapped depth (L1=Remember .. L6+=Create
|
||||
Staff-level). Read the question + scenario + realistic_solution and judge
|
||||
whether the claimed level matches what the question actually demands.
|
||||
Verdict: pass | fail
|
||||
On fail: `level_fit_rationale` (1-2 sentences), `suggested_level` (e.g. "L3").
|
||||
|
||||
Gate C — coherence
|
||||
Reject (verdict=fail) on any of:
|
||||
1. PHYSICAL ABSURDITY: hardware/software numbers violate real-world
|
||||
bounds (e.g., NPU wake-up >50ms, smartphone pulling 50W, latency
|
||||
>5× off realistic for the named hardware).
|
||||
2. VENDOR-NAME FABRICATION: hardware/framework/benchmark names that
|
||||
don't exist or are misattributed (e.g., "Coral Edge TPU XL" — no XL
|
||||
variant). Treat ambiguous-but-plausible as ok; flag clearly invented.
|
||||
3. SCENARIO/QUESTION/SOLUTION MISMATCH: question doesn't follow from
|
||||
scenario, realistic_solution doesn't actually answer the question,
|
||||
or numbers contradict across fields.
|
||||
4. ARITHMETIC IN SCENARIO: scenario contains a stated calculation that
|
||||
is wrong on its face (this is separate from gate D's napkin math).
|
||||
Verdict: pass | fail · `coherence_failure_mode` (one of: physical-absurdity,
|
||||
vendor-fabrication, mismatch, scenario-arithmetic, none) · `coherence_rationale`.
|
||||
|
||||
Gate D — math_correct
|
||||
Independently re-derive the napkin_math calculations. Are the assumptions
|
||||
sound? Do the unit conversions check out? Does the conclusion follow?
|
||||
Verdict: pass | fail · `math_errors: [<short error list>]` on fail.
|
||||
|
||||
Gate E — title_quality
|
||||
Title (≤120 chars, plaintext, no LaTeX, no markdown, no underscores).
|
||||
Verdicts:
|
||||
good — descriptive, concrete, names the operative concept
|
||||
generic — too vague to retrieve ("Cloud Q1", "Memory Question")
|
||||
placeholder — clearly an unfilled placeholder ("TODO", "draft", "x")
|
||||
On non-good: `title_suggestion` if you can produce a short concrete one.
|
||||
|
||||
OUTPUT JSON SHAPE (one per line in `audit_results.jsonl`)
|
||||
|
||||
{
|
||||
"qid": "cloud-2297",
|
||||
"track": "cloud",
|
||||
"format_compliance": "pass" | "fail",
|
||||
"format_issues": [],
|
||||
"level_fit": "pass" | "fail",
|
||||
"level_fit_rationale": "...",
|
||||
"suggested_level": "L4" | null,
|
||||
"coherence": "pass" | "fail",
|
||||
"coherence_failure_mode": "none" | "physical-absurdity" | ...,
|
||||
"coherence_rationale": "...",
|
||||
"math_correct": "pass" | "fail",
|
||||
"math_errors": [],
|
||||
"title_quality": "good" | "generic" | "placeholder",
|
||||
"title_suggestion": null
|
||||
}
|
||||
|
||||
CRITICAL RULES
|
||||
|
||||
- Append only. Do not rewrite the file. Each batch you complete should be
|
||||
durable on disk so a kill-9 mid-run loses at most one item.
|
||||
- Do not modify any YAML. This is read-only audit; corrections are a
|
||||
downstream task.
|
||||
- Skip non-published statuses. Do not audit drafts, flagged, deleted,
|
||||
or archived.
|
||||
- Process at least 200 items per session. Print progress every 25.
|
||||
- If you encounter a YAML you can't parse, write a record with
|
||||
`qid: "<filename-stem>"` and all gates `error`, plus `_reason: "..."`.
|
||||
- If you hit a tool / network error, write what you have so far, then
|
||||
print `STOPPING: <reason>` and exit cleanly. Do not crash.
|
||||
|
||||
START NOW. First action: read the existing
|
||||
`audit_results.jsonl` (or note
|
||||
that it doesn't exist), then list published YAMLs.
|
||||
1826
mobile_published.txt
Normal file
1826
mobile_published.txt
Normal file
File diff suppressed because it is too large
Load Diff
10
run_audit.sh
Executable file
10
run_audit.sh
Executable file
@@ -0,0 +1,10 @@
|
||||
#!/bin/bash
#
# run_audit.sh - run the Gemini self-audit prompt for one question track.
#
# Usage: run_audit.sh TRACK [MODEL]
#   TRACK  prefix of the prompt file, i.e. "${TRACK}_audit_prompt.md"
#   MODEL  optional; forwarded as `gemini -m MODEL`. When omitted or empty,
#          no -m flag is passed.
#
# The script changes into MLSysBook-yaml-audit (relative to the caller's
# working directory) and resolves the prompt path from there. stdin of the
# gemini process is redirected from /dev/null.
set -euo pipefail

readonly REPO_DIR="MLSysBook-yaml-audit"
readonly PROMPT_DIR="interviews/vault/_pipeline/runs/gemini-self-audit/prompts"

# Print a message to stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print usage to stderr and abort with status 2.
usage() {
  printf 'Usage: %s TRACK [MODEL]\n' "${0##*/}" >&2
  exit 2
}

TRACK=${1:-}
MODEL=${2:-}

# Without a track the prompt path would be "_audit_prompt.md" and gemini
# would be started with an empty prompt, so refuse early.
[[ -n "$TRACK" ]] || usage

# gemini is launched with --yolo (NOTE(review): presumably auto-approves its
# actions - confirm against the CLI docs), so it must never start in the
# wrong directory.
cd -- "$REPO_DIR" || die "cannot cd to $REPO_DIR"

prompt_file="${PROMPT_DIR}/${TRACK}_audit_prompt.md"
[[ -r "$prompt_file" ]] || die "prompt file not readable: $prompt_file"
prompt=$(<"$prompt_file")

# Build the argument list once; the model flag is the only optional part.
gemini_args=()
if [[ -n "$MODEL" ]]; then
  gemini_args+=(-m "$MODEL")
fi
gemini_args+=(--yolo --skip-trust -p "$prompt")

gemini "${gemini_args[@]}" < /dev/null
|
||||
1208
tinyml_published.txt
Normal file
1208
tinyml_published.txt
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user