// Mirror of https://github.com/harvard-edge/cs249r_book.git
// Synced 2026-05-07 10:08:50 -05:00 (529 lines, 16 KiB, JavaScript)
// textExtractor.js
|
|
import { TEXT_EXTRACTION_CONFIG } from '../../configs/client.config.js';
|
|
|
|
/**
 * Approximates how many tokens a string will consume.
 * Heuristic: about four characters per token for English text, measured
 * after every whitespace run has been collapsed to a single space.
 * @param {string} text - Text to measure
 * @returns {number} - Estimated token count (0 for empty or missing text)
 */
function estimateTokens(text) {
  if (text) {
    // Collapse whitespace first so layout and indentation do not inflate the estimate
    const normalized = text.replace(/\s+/g, ' ').trim();
    return Math.ceil(normalized.length / 4);
  }
  return 0;
}
|
|
|
|
/**
 * Decides whether a DOM element must be left out of text extraction.
 * An element is excluded when it matches one of the selectors in
 * TEXT_EXTRACTION_CONFIG.NAV_SELECTORS, when its computed style is
 * `display: none` or `visibility: hidden`, or when its `role` attribute is
 * navigation, banner or contentinfo.
 * @param {Element} element - The DOM element to inspect
 * @returns {boolean} - True when the element should be skipped
 */
function shouldExcludeElement(element) {
  // 1. Configured navigation selectors
  for (const navSelector of TEXT_EXTRACTION_CONFIG.NAV_SELECTORS) {
    if (element.matches(navSelector)) return true;
  }

  // 2. Elements hidden through CSS
  const { display, visibility } = window.getComputedStyle(element);
  if (display === 'none' || visibility === 'hidden') return true;

  // 3. Roles that mark navigation / header / footer regions
  const excludedRoles = ['navigation', 'banner', 'contentinfo'];
  const role = element.getAttribute('role');
  return Boolean(role) && excludedRoles.includes(role);
}
|
|
|
|
/**
 * Extracts the text of a DOM subtree, leaving out navigation components.
 * Works on a deep clone so the live document is never modified. Removed from
 * the clone before reading its text: elements matching
 * TEXT_EXTRACTION_CONFIG.NAV_SELECTORS, elements whose role is navigation,
 * banner or contentinfo, and <script>/<style> elements.
 * @param {Element} rootElement - The root element to extract text from (defaults to document.body)
 * @returns {string} - Extracted text with every whitespace run collapsed to a
 *   single space and the ends trimmed; '' when no root element is available
 */
function extractTextContent(rootElement = document.body) {
  if (!rootElement) {
    console.warn('No root element provided for text extraction');
    return '';
  }

  // Clone the element to avoid modifying the original DOM
  const clonedElement = rootElement.cloneNode(true);

  const removeAll = (selector) => {
    clonedElement.querySelectorAll(selector).forEach((el) => el.remove());
  };

  // Configured navigation elements, then navigation roles, then non-content elements
  TEXT_EXTRACTION_CONFIG.NAV_SELECTORS.forEach((selector) => removeAll(selector));
  removeAll('[role="navigation"], [role="banner"], [role="contentinfo"]');
  removeAll('script, style');

  // A single whitespace pass is enough: once every run of whitespace
  // (newlines included) is a single space, there are no empty lines left,
  // so the former second "remove empty lines" replace could never match.
  return (clonedElement.textContent || '').replace(/\s+/g, ' ').trim();
}
|
|
|
|
/**
 * Extracts text content together with a per-section source mapping.
 * Works on a deep clone of the root with navigation elements, navigation
 * roles and <script>/<style> removed. Sections are the elements matched by
 * the first selector in TEXT_EXTRACTION_CONFIG.CONTENT_SELECTORS that matches
 * anything; when none matches, the whole clone is treated as one section.
 * Sections whose cleaned text is 50 characters or shorter are skipped.
 * @param {Element} rootElement - The root element to extract text from (defaults to document.body)
 * @returns {Object} - `{ text, sources }`: `sources` holds one descriptor per
 *   kept section and `text` joins them as `## label` blocks separated by `---`
 */
function extractTextContentWithSources(rootElement = document.body) {
  if (!rootElement) {
    console.warn('No root element provided for text extraction');
    return { text: '', sources: [] };
  }

  // Clone the element to avoid modifying the original DOM
  const clonedElement = rootElement.cloneNode(true);

  const removeAll = (selector) => {
    clonedElement.querySelectorAll(selector).forEach((el) => el.remove());
  };
  TEXT_EXTRACTION_CONFIG.NAV_SELECTORS.forEach((selector) => removeAll(selector));
  removeAll('[role="navigation"], [role="banner"], [role="contentinfo"]');
  removeAll('script, style');

  // Use the first content selector that matches anything
  let contentElements = [];
  for (const selector of TEXT_EXTRACTION_CONFIG.CONTENT_SELECTORS) {
    const matches = clonedElement.querySelectorAll(selector);
    if (matches.length > 0) {
      contentElements = Array.from(matches);
      break;
    }
  }

  // If no main content was found, use the entire cloned element
  if (contentElements.length === 0) {
    contentElements = [clonedElement];
  }

  const sources = [];
  contentElements.forEach((element, index) => {
    // A single whitespace pass is enough; after it no empty lines remain
    const content = (element.textContent || '').replace(/\s+/g, ' ').trim();
    if (content.length <= 50) {
      return; // Only include substantial content
    }

    // Always truthy thanks to the generated fallback, so it doubles as the label
    const elementId = element.id || `content-section-${index}`;

    sources.push({
      sourceId: `source-${index}`,
      label: elementId,
      content,
      pageUrl: window.location.href,
      domain: window.location.hostname,
      level: 'page',
      position: index, // index among the matched elements, skipped ones included
      elementId,
      elementClass: element.className || '',
      elementTag: element.tagName.toLowerCase()
    });
  });

  // Combine all content sections
  const text = sources
    .map((source) => `## ${source.label}\n\n${source.content}`)
    .join('\n\n---\n\n');

  return { text, sources };
}
|
|
|
|
/**
 * Samples text content to fit within a token limit.
 * Text already within the limit is returned unchanged. Otherwise the text is
 * split into sentences on `.`, `!` and `?`, a subset is chosen by
 * randomDistributedSampling, and the chosen sentences are re-joined with
 * '. ' plus a final '.'. Trailing sentences are dropped while the joined
 * text is still over the limit, because the separators are not part of the
 * per-sentence estimates. If no sentence can be kept (for example the text
 * has no sentence punctuation), a budget-proportional prefix of the text is
 * returned instead of a bare '.'.
 * @param {string} text - The full text content
 * @param {number} maxTokens - Maximum number of tokens allowed
 * @returns {string} - Sampled text content
 */
function sampleTextContent(text, maxTokens = TEXT_EXTRACTION_CONFIG.MAX_TOKENS) {
  const currentTokens = estimateTokens(text);

  if (currentTokens <= maxTokens) {
    return text;
  }

  // Last resort: keep a prefix whose share of the text equals the budget's share of the tokens
  const truncateToBudget = () => {
    const samplingRatio = maxTokens / currentTokens;
    return text.substring(0, Math.floor(text.length * samplingRatio));
  };

  // Split text into sentences for better sampling
  const sentences = text.split(/[.!?]+/).filter((s) => s.trim().length > 0);
  if (sentences.length === 0) {
    return truncateToBudget();
  }

  // Calculate how many sentences we can fit, never more than exist
  const avgTokensPerSentence = currentTokens / sentences.length;
  const targetSentenceCount = Math.floor(maxTokens / avgTokensPerSentence);
  const actualTargetCount = Math.min(targetSentenceCount, sentences.length);

  // Copy so that trimming below never touches the array returned by the sampler
  const picked = [...randomDistributedSampling(sentences, actualTargetCount, maxTokens)];

  while (picked.length > 0) {
    const candidate = `${picked.join('. ')}.`;
    if (estimateTokens(candidate) <= maxTokens) {
      return candidate;
    }
    picked.pop();
  }

  // Nothing fits as whole sentences
  return truncateToBudget();
}
|
|
|
|
/**
 * Picks a random subset of sentences spread across the whole text.
 * Sentence indices are shuffled with shuffleArray and visited in that order;
 * a sentence is kept when its estimated tokens still fit into `maxTokens`.
 * Picking stops once `targetCount` sentences are kept or 95% of the budget
 * is used. Every index is visited at most once, so no sentence can be
 * returned twice, and the kept sentences are returned trimmed and in their
 * original order.
 * @param {Array} sentences - Array of sentences
 * @param {number} targetCount - Target number of sentences to sample
 * @param {number} maxTokens - Maximum tokens allowed
 * @returns {Array} - Array of sampled sentences; the input array itself
 *   (untrimmed) when it holds no more than `targetCount` sentences
 */
function randomDistributedSampling(sentences, targetCount, maxTokens) {
  if (sentences.length <= targetCount) {
    return sentences;
  }

  const shuffledIndices = shuffleArray(
    Array.from({ length: sentences.length }, (_, i) => i)
  );

  const pickedIndices = [];
  let sampledTokens = 0;

  for (const index of shuffledIndices) {
    // Stop if we've reached our target or are close to the token limit
    if (pickedIndices.length >= targetCount || sampledTokens >= maxTokens * 0.95) {
      break;
    }

    const sentenceTokens = estimateTokens(sentences[index].trim());

    // The remaining budget only shrinks, so a sentence that does not fit now
    // can never fit later; a second "fill up" pass would have nothing to add.
    if (sampledTokens + sentenceTokens <= maxTokens) {
      pickedIndices.push(index);
      sampledTokens += sentenceTokens;
    }
  }

  // Restore document order so the sample reads in sequence
  return pickedIndices
    .sort((a, b) => a - b)
    .map((index) => sentences[index].trim());
}
|
|
|
|
/**
 * Returns a shuffled copy of an array (Fisher-Yates); the input is not modified.
 * @param {Array} array - Array to shuffle
 * @returns {Array} - New array with the same elements in random order
 */
function shuffleArray(array) {
  const result = Array.from(array);
  let remaining = result.length;

  // Walk from the end: swap the last unsettled slot with a random slot at or before it
  while (remaining > 1) {
    const pick = Math.floor(Math.random() * remaining);
    remaining -= 1;

    const held = result[remaining];
    result[remaining] = result[pick];
    result[pick] = held;
  }

  return result;
}
|
|
|
|
/**
 * Entry point for extracting page text for quiz generation.
 * Extracts the text with extractTextContent, then passes it through
 * sampleTextContent with TEXT_EXTRACTION_CONFIG.MAX_TOKENS as the limit.
 * Errors are caught and reported in the result rather than thrown.
 * @param {Element} rootElement - Optional root element to extract from
 * @returns {Object} - `{ text, tokens, sampled, originalTokens }`, plus
 *   `error` (the error message) when extraction threw
 */
export function extractPageTextForQuiz(rootElement = document.body) {
  try {
    const fullText = extractTextContent(rootElement);

    if (!fullText) {
      console.warn('No text content found on the page');
      return { text: '', tokens: 0, sampled: false, originalTokens: 0 };
    }

    // Measure before and after sampling; fewer tokens afterwards means sampling took place
    const originalTokens = estimateTokens(fullText);
    const text = sampleTextContent(fullText, TEXT_EXTRACTION_CONFIG.MAX_TOKENS);
    const tokens = estimateTokens(text);
    const sampled = tokens < originalTokens;

    console.log(`Text extraction complete:`, {
      originalTokens,
      finalTokens: tokens,
      sampled,
      textLength: text.length
    });

    return { text, tokens, sampled, originalTokens };
  } catch (error) {
    console.error('Error extracting page text:', error);
    return {
      text: '',
      tokens: 0,
      sampled: false,
      originalTokens: 0,
      error: error.message
    };
  }
}
|
|
|
|
/**
 * Entry point for extracting page text with source mapping for cumulative
 * quiz generation.
 * Extracts sections with extractTextContentWithSources, then fits them into
 * TEXT_EXTRACTION_CONFIG.MAX_TOKENS with sampleTextContentWithSources.
 * `tokens` and `originalTokens` are both estimated from the formatted text
 * (section headers and separators included), so `sampled` is true only when
 * the returned text is actually smaller than the extracted text.
 * Errors are caught and reported in the result rather than thrown.
 * @param {Element} rootElement - Optional root element to extract from
 * @returns {Object} - `{ text, sources, tokens, sampled, originalTokens }`,
 *   plus `error` (the error message) when extraction threw
 */
export function extractPageTextWithSourcesForQuiz(rootElement = document.body) {
  try {
    // Extract text content with source mapping
    const extractionResult = extractTextContentWithSources(rootElement);

    if (!extractionResult.text || extractionResult.sources.length === 0) {
      console.warn('No text content or sources found on the page');
      return {
        text: '',
        sources: [],
        tokens: 0,
        sampled: false,
        originalTokens: 0
      };
    }

    const originalTokens = estimateTokens(extractionResult.text);

    // Use source-aware sampling for better distribution
    const samplingResult = sampleTextContentWithSources(
      extractionResult.sources,
      TEXT_EXTRACTION_CONFIG.MAX_TOKENS
    );
    const finalText = samplingResult.text;

    // Measure the returned text the same way as the original text.
    // samplingResult.tokens counts section content only, so comparing it with
    // originalTokens (which includes headers) would always report sampling.
    const finalTokens = estimateTokens(finalText);
    const wasSampled = finalTokens < originalTokens;

    console.log(`Text extraction with sources complete:`, {
      originalTokens,
      finalTokens,
      sampled: wasSampled,
      textLength: finalText.length,
      sourceCount: extractionResult.sources.length,
      sampledSourceCount: samplingResult.sampledSources.length
    });

    return {
      text: finalText,
      sources: samplingResult.sampledSources,
      tokens: finalTokens,
      sampled: wasSampled,
      originalTokens: originalTokens
    };
  } catch (error) {
    console.error('Error extracting page text with sources:', error);
    return {
      text: '',
      sources: [],
      tokens: 0,
      sampled: false,
      originalTokens: 0,
      error: error.message
    };
  }
}
|
|
|
|
/**
 * Samples text content from multiple sources so that every source is
 * represented and the token budget is shared fairly.
 * When all sources fit into `maxTokens` they are returned unchanged.
 * Otherwise sources are visited from smallest to largest; each one may use
 * up to an equal share of the budget that is still unspent, so whatever a
 * small source leaves unused is passed on to the larger ones. Each source's
 * content is reduced with sampleTextContent. The result lists the kept
 * sources in their original order.
 * @param {Array} sources - Array of source objects with `label` and `content`
 * @param {number} maxTokens - Maximum tokens allowed
 * @returns {Object} - `{ text, tokens, sampledSources }`; `tokens` is the sum
 *   of the estimated content tokens of the kept sources, and sampled sources
 *   carry their own `tokens` estimate
 */
function sampleTextContentWithSources(sources, maxTokens = TEXT_EXTRACTION_CONFIG.MAX_TOKENS) {
  if (!sources || sources.length === 0) {
    return { text: '', tokens: 0, sampledSources: [] };
  }

  const formatSources = (list) =>
    list.map((source) => `## ${source.label}\n\n${source.content}`).join('\n\n---\n\n');

  // Measure every source once and remember where it came from
  const measured = sources.map((source, order) => ({
    source,
    order,
    tokens: estimateTokens(source.content)
  }));
  const totalTokens = measured.reduce((sum, entry) => sum + entry.tokens, 0);

  if (totalTokens <= maxTokens) {
    // No sampling needed
    return {
      text: formatSources(sources),
      tokens: totalTokens,
      sampledSources: sources
    };
  }

  // Smallest first: a source that needs less than its share frees budget for the rest
  const bySize = [...measured].sort((a, b) => a.tokens - b.tokens);

  const kept = [];
  let sampledTokens = 0;

  bySize.forEach((entry, visited) => {
    const remainingBudget = maxTokens - sampledTokens;
    const fairShare = Math.floor(remainingBudget / (bySize.length - visited));
    const allowance = Math.min(entry.tokens, fairShare);
    if (allowance <= 0) {
      return;
    }

    const sampledContent = sampleTextContent(entry.source.content, allowance);
    const actualTokens = estimateTokens(sampledContent);

    // Skip empty samples, and samples that would push the total over the limit
    if (actualTokens === 0 || sampledTokens + actualTokens > maxTokens) {
      return;
    }

    kept.push({
      order: entry.order,
      source: { ...entry.source, content: sampledContent, tokens: actualTokens }
    });
    sampledTokens += actualTokens;
  });

  // Back to the order in which the sources were supplied
  const sampledSources = kept
    .sort((a, b) => a.order - b.order)
    .map((entry) => entry.source);

  return {
    text: formatSources(sampledSources),
    tokens: sampledTokens,
    sampledSources: sampledSources
  };
}
|
|
|
|
/**
 * Reports a failed text extraction on the console.
 * Results without an `error` value produce no output.
 * @param {Object} extractionResult - Result from extractPageTextForQuiz or extractPageTextWithSourcesForQuiz
 */
export function logTextExtraction(extractionResult) {
  const { error } = extractionResult;

  // Nothing to report for successful extractions
  if (!error) {
    return;
  }

  console.error('Extraction Error:', error);
}
|
|
|
|
/**
 * Returns the URL of the page the script is running on.
 * @returns {string} - Value of window.location.href
 */
export function getCurrentPageUrl() {
  const { href } = window.location;
  return href;
}
|
|
|
|
/**
 * Returns the title of the current document.
 * @returns {string} - document.title, or 'Untitled Page' when the title is empty
 */
export function getCurrentPageTitle() {
  const { title } = document;
  return title ? title : 'Untitled Page';
}