From 645ef478a2dc17215302ea3c16218c0c767ec367 Mon Sep 17 00:00:00 2001 From: Vijay Janapa Reddi Date: Mon, 27 Oct 2025 13:03:36 -0400 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20Add=20Shakespeare=20dataset=20to=20?= =?UTF-8?q?DatasetManager?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add get_shakespeare() method to download tiny-shakespeare.txt - Downloads from Karpathy's char-rnn repository (1MB corpus) - Returns raw text for character-level language modeling - Follows same pattern as MNIST/CIFAR-10 downloads - Includes test in main() function --- milestones/data_manager.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/milestones/data_manager.py b/milestones/data_manager.py index 8a28a5e9..c753e4ac 100644 --- a/milestones/data_manager.py +++ b/milestones/data_manager.py @@ -133,6 +133,29 @@ class DatasetManager: print(f"šŸ“Š CIFAR-10 loaded: {len(train_data)} training, {len(test_data)} test images") return (train_data, train_labels), (test_data, test_labels) + def get_shakespeare(self): + """Download and prepare Shakespeare text dataset for transformer milestone.""" + shakespeare_dir = self.data_dir / "shakespeare" + shakespeare_dir.mkdir(exist_ok=True) + + # Shakespeare text file + text_file = shakespeare_dir / "tiny-shakespeare.txt" + + if not text_file.exists(): + # Download from Karpathy's char-rnn repo + url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt" + print("šŸ“„ Downloading tiny-shakespeare.txt...") + self.download_with_progress(url, text_file) + + # Load text + with open(text_file, 'r', encoding='utf-8') as f: + text = f.read() + + print(f"šŸ“Š Shakespeare loaded: {len(text):,} characters, {len(text.split()):,} words") + print(f" First 100 chars: {text[:100]!r}...") + + return text + def get_xor_data(self, num_samples=1000): """Generate XOR problem data for non-linear milestone.""" print("🧮 Generating XOR problem data...") @@ -217,6 +240,13 @@ def main(): except Exception as e: print(f" CIFAR-10 download failed: {e}") + print("\n5. Testing Shakespeare Text:") + try: + text = manager.get_shakespeare() + print(f" Length: {len(text):,} characters") + except Exception as e: + print(f" Shakespeare download failed: {e}") + print("\nāœ… Dataset Manager test complete!") if __name__ == "__main__":