mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-05 17:18:48 -05:00
Remove redundant ml_ prefix from ml_workflow chapter files and update all Quarto config references. Consolidate custom scripts into native binder subcommands and archive obsolete tooling.
259 lines
9.1 KiB
Python
259 lines
9.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Smart Glossary Consolidation using LLM-based similarity detection.
|
|
|
|
This script automates the process of finding and consolidating similar terms
|
|
in the glossary using intelligent similarity detection and LLM-based decisions.
|
|
|
|
Workflow:
|
|
1. Detect similar terms using multiple similarity metrics
|
|
2. Group potential duplicates for LLM review
|
|
3. Use LLM to decide which terms to merge and how
|
|
4. Apply consolidation decisions automatically
|
|
5. Generate clean master glossary
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Dict, List, Tuple, Set
|
|
from difflib import SequenceMatcher
|
|
from collections import defaultdict
|
|
|
|
def calculate_similarity(term1: str, term2: str) -> float:
    """Score how alike two glossary term names are.

    Both names are first passed through ``normalize_for_comparison``. The
    score is decided by the first rule that applies:

    1. Identical normalized forms score 1.0.
    2. One non-empty normalized form being a substring of the other
       scores 0.9.
    3. If the two names share more than half of their combined distinct
       words, the score is the larger of that word overlap and the
       ``SequenceMatcher`` character ratio.
    4. Otherwise the ``SequenceMatcher`` ratio is returned.

    Args:
        term1: First term name.
        term2: Second term name.

    Returns:
        A similarity score between 0.0 and 1.0.
    """
    norm1 = normalize_for_comparison(term1)
    norm2 = normalize_for_comparison(term2)

    if norm1 == norm2:
        return 1.0

    # The empty string is a substring of every string, so a name that
    # normalizes to nothing must not trigger the containment rule.
    if norm1 and norm2 and (norm1 in norm2 or norm2 in norm1):
        return 0.9

    # Only computed once the cheap rules above have not already decided.
    seq_sim = SequenceMatcher(None, norm1, norm2).ratio()

    words1 = set(norm1.split())
    words2 = set(norm2.split())
    if words1 and words2:
        # Shared words divided by all distinct words across both names.
        word_overlap = len(words1 & words2) / len(words1 | words2)
        if word_overlap > 0.5:
            return max(seq_sim, word_overlap)

    return seq_sim
|
|
|
|
def normalize_for_comparison(term: str) -> str:
    """Reduce a term name to the canonical form used for similarity checks.

    The name is lowercased, parenthesized segments are dropped, punctuation
    is removed, whitespace runs collapse to single spaces, and one trailing
    "s" is stripped unless the text ends in "ss".
    """
    cleanup_steps = (
        (r'\s*\([^)]*\)', ''),  # parenthesized segment and the spaces before it
        (r'[^\w\s]', ''),       # anything that is neither word char nor space
        (r'\s+', ' '),          # runs of whitespace
    )

    text = term.lower()
    for pattern, replacement in cleanup_steps:
        text = re.sub(pattern, replacement, text)
    text = text.strip()

    # Crude singularization: drop a single trailing "s" but keep "ss" endings.
    looks_plural = text.endswith('s') and not text.endswith('ss')
    return text[:-1] if looks_plural else text
|
|
|
|
def find_similar_terms(terms: List[Dict], similarity_threshold: float = 0.7) -> List[List[Dict]]:
    """Group terms whose names are similar enough to warrant a merge review.

    Grouping is greedy and seed-based: walking ``terms`` in order, each term
    not yet placed in a group becomes a seed, and every later ungrouped term
    whose ``calculate_similarity`` score against the seed reaches the
    threshold joins the seed's group. Members are compared with the seed
    only, never with each other, and a term belongs to at most one group.

    Args:
        terms: Glossary entries; each must have a ``'term'`` key holding
            the term name.
        similarity_threshold: Minimum score (inclusive) for a term to join
            a seed's group. Defaults to 0.7.

    Returns:
        The groups that ended up with more than one member, in seed order.
    """
    groups = []
    grouped = set()  # indices already assigned to some group

    for i, seed in enumerate(terms):
        if i in grouped:
            continue
        grouped.add(i)
        group = [seed]

        # Index iteration avoids copying the tail of the list for every seed.
        for j in range(i + 1, len(terms)):
            if j in grouped:
                continue
            candidate = terms[j]
            if calculate_similarity(seed['term'], candidate['term']) >= similarity_threshold:
                group.append(candidate)
                grouped.add(j)

        # A seed with no look-alikes is not a consolidation candidate.
        if len(group) > 1:
            groups.append(group)

    return groups
|
|
|
|
def generate_consolidation_prompt(term_group: List[Dict], max_definition_chars: int = 100) -> str:
    """Build the LLM prompt asking whether a group of terms should be merged.

    Each term is listed as ``- '<term>': <definition>``, followed by
    `` (from <chapter_source>)`` when the entry has a non-empty
    ``'chapter_source'``. The prompt then asks for a JSON object with the
    keys ``action``, ``reasoning``, ``preferred_term``,
    ``preferred_definition`` and ``appears_in``.

    Args:
        term_group: Glossary entries to compare. Each must have a ``'term'``
            key; ``'definition'`` and ``'chapter_source'`` are optional.
        max_definition_chars: Longest definition excerpt to include.
            Longer definitions are cut to this length and suffixed with
            "..."; shorter ones are included unchanged. Defaults to 100.

    Returns:
        The complete prompt text.
    """
    terms_info = []
    for term in term_group:
        definition = term.get('definition') or ''
        # Only mark an excerpt as truncated when text was actually dropped;
        # an unconditional "..." tells the model a complete definition is cut.
        if len(definition) > max_definition_chars:
            definition = definition[:max_definition_chars] + '...'
        info = f"- '{term['term']}': {definition}"
        if term.get('chapter_source'):
            info += f" (from {term['chapter_source']})"
        terms_info.append(info)

    term_lines = "\n".join(terms_info)

    # Doubled braces below are literal braces in the rendered prompt.
    prompt = f"""I have found these potentially similar glossary terms that might need consolidation:

{term_lines}

Please analyze these terms and provide a JSON response with your consolidation decision:

{{
"action": "merge|keep_separate",
"reasoning": "brief explanation of your decision",
"preferred_term": "the term name to keep if merging",
"preferred_definition": "the best definition if merging",
"appears_in": ["list", "of", "chapters", "if", "merging"]
}}

Guidelines:
- MERGE if terms refer to the same concept (e.g., "adversarial example" vs "adversarial examples")
- MERGE if one term is a clear subset/superset of another
- KEEP_SEPARATE if terms have meaningfully different definitions or contexts
- For merged terms, prefer the most comprehensive definition
- Use singular form for merged terms unless plural is more standard
- Include all source chapters in appears_in for merged terms

Respond with only the JSON, no other text."""

    return prompt
|
|
|
|
def apply_consolidation_decisions(decisions: List[Dict], original_terms: List[Dict]) -> List[Dict]:
    """Produce a new term list with the merge decisions applied.

    For every decision whose ``'action'`` is ``'merge'`` a fresh entry is
    built from ``'preferred_term'``, ``'preferred_definition'`` and
    ``'appears_in'`` (with empty ``'aliases'`` and ``'see_also'``). The
    entries named in the decision's ``'original_terms'`` are dropped, and so
    is any existing entry already carrying the preferred name. Decisions
    with any other action leave the terms untouched.

    Args:
        decisions: Decision dicts. Each needs an ``'action'`` key; merge
            decisions also need ``'preferred_term'`` and
            ``'preferred_definition'``. ``'appears_in'`` and
            ``'original_terms'`` are optional.
        original_terms: Current glossary entries, each with a ``'term'`` key.

    Returns:
        A new list: merged entries first (in decision order), then the
        surviving original entries in their original order. Neither input
        list is modified.
    """
    merged_terms = []
    replaced_names = set()

    for decision in decisions:
        if decision['action'] != 'merge':
            continue

        for term in decision.get('original_terms') or []:
            replaced_names.add(term['term'])
        # The merged entry supersedes an existing entry of the same name even
        # when 'original_terms' is missing or incomplete; otherwise the
        # glossary would end up with two entries for one term.
        replaced_names.add(decision['preferred_term'])

        # Copy so the new entry does not share a list with the decision.
        appears_in = list(decision.get('appears_in') or [])
        merged_terms.append({
            'term': decision['preferred_term'],
            'definition': decision['preferred_definition'],
            'appears_in': appears_in,
            'chapter_source': appears_in[0] if appears_in else '',
            'aliases': [],
            'see_also': [],
        })

    survivors = [term for term in original_terms if term['term'] not in replaced_names]
    return merged_terms + survivors
|
|
|
|
def process_with_claude(term_group: List[Dict]) -> Dict:
    """Return a consolidation decision for one group of similar terms.

    The Claude API call is not wired up yet: this currently returns a
    placeholder ``keep_separate`` decision built from the first term of the
    group, so no third-party package or API key is needed to run it.

    Args:
        term_group: Non-empty list of glossary entries. The first entry must
            have ``'term'`` and ``'definition'`` keys; ``'chapter_source'``
            is optional.

    Returns:
        A decision dict with the keys ``action``, ``reasoning``,
        ``preferred_term``, ``preferred_definition`` and ``appears_in``.

    Raises:
        IndexError: If ``term_group`` is empty.
        KeyError: If the first entry lacks ``'term'`` or ``'definition'``.
    """
    # TODO: replace the placeholder below with the real call, along the
    # lines of (requires the `anthropic` package and an API key):
    #
    #     import anthropic
    #     client = anthropic.Anthropic(api_key="your-api-key")
    #     prompt = generate_consolidation_prompt(term_group)
    #     message = client.messages.create(
    #         model="claude-3-sonnet-20240229",
    #         max_tokens=1000,
    #         messages=[{"role": "user", "content": prompt}]
    #     )
    #     response = message.content[0].text
    #     return json.loads(response)

    first = term_group[0]

    # Mock decision for demonstration
    return {
        "action": "keep_separate",
        "reasoning": "Mock decision - would be replaced with actual Claude API call",
        "preferred_term": first['term'],
        "preferred_definition": first['definition'],
        "appears_in": [first.get('chapter_source', '')]
    }
|
|
|
|
def save_consolidation_log(decisions: List[Dict], output_path: Path):
    """Write the decisions and summary counts to a JSON log for review.

    The log is always named ``consolidation_log.json`` and is created in the
    directory that contains ``output_path``; ``output_path`` itself is not
    written. The saved location is printed afterwards.

    Args:
        decisions: Decision dicts, each with an ``'action'`` key.
        output_path: Path whose parent directory receives the log file.
    """
    def count_action(action):
        # Number of decisions carrying exactly this action value.
        return sum(1 for entry in decisions if entry['action'] == action)

    summary = {
        'timestamp': 'generated_automatically',
        'total_decisions': len(decisions),
        'merge_count': count_action('merge'),
        'keep_separate_count': count_action('keep_separate'),
        'decisions': decisions,
    }

    destination = output_path.parent / 'consolidation_log.json'
    with destination.open('w') as handle:
        json.dump(summary, handle, indent=2)

    print(f"📋 Consolidation log saved: {destination}")
|
|
|
|
def main():
    """Report which glossary terms look like duplicates (dry run).

    Loads the master glossary JSON, groups similar term names with
    ``find_similar_terms``, prints the groups, and shows the start of the
    prompt that would be sent to an LLM for the first group. No LLM is
    called and no file is written.
    """
    print("🔧 Smart Glossary Consolidation")
    print("=" * 50)

    # Project root is taken to be three directories above the one that
    # contains this script.
    project_root = Path(__file__).parent.parent.parent.parent
    master_path = project_root / "quarto/contents/backmatter/glossary/global_glossary.json"

    print("📚 Loading current master glossary...")
    # Read explicitly as UTF-8: without it open() falls back to the locale's
    # encoding, which can fail on or garble non-ASCII text in the glossary.
    with open(master_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    original_count = len(data['terms'])
    print(f" → Found {original_count} terms")

    print("🔍 Detecting similar terms...")
    similar_groups = find_similar_terms(data['terms'])

    if not similar_groups:
        print("✅ No similar terms found that need consolidation!")
        return

    print(f"📊 Found {len(similar_groups)} groups of similar terms:")
    for i, group in enumerate(similar_groups, 1):
        terms = [t['term'] for t in group]
        print(f" {i:2d}. {terms}")

    print(f"\n🤖 This would require {len(similar_groups)} LLM calls to decide consolidations.")
    print("📝 Each group would be analyzed for:")
    print(" • Semantic similarity")
    print(" • Definition overlap")
    print(" • Context appropriateness")
    print(" • Standard glossary practices")

    print("\n🎯 Potential outcomes:")
    print(" • Merge similar terms (e.g., 'example' + 'examples' → 'example')")
    print(" • Keep distinct terms (e.g., 'training' vs 'training data')")
    print(" • Standardize definitions across chapters")
    print(" • Maintain chapter attribution")

    # For demonstration, show what the first prompt would look like.
    # similar_groups is known to be non-empty here (see early return above).
    print("\n📋 Example prompt for group 1:")
    print("-" * 40)
    prompt = generate_consolidation_prompt(similar_groups[0])
    # Long prompts are cut to their first 500 characters for display only.
    print(prompt[:500] + "..." if len(prompt) > 500 else prompt)

    print("\n💡 To implement:")
    print(" 1. Add LLM API integration (OpenAI/Anthropic)")
    print(" 2. Process each group with LLM decision")
    print(" 3. Apply consolidation automatically")
    print(" 4. Regenerate master glossary and QMD file")
    print(" 5. Log all decisions for review")


if __name__ == "__main__":
    main()
|