// Source path: cs249r_book/socratiq/src_shadow/js/utils/textExtractor.js
// Listing metadata: 529 lines, 16 KiB, JavaScript

// textExtractor.js
import { TEXT_EXTRACTION_CONFIG } from '../../configs/client.config.js';
/**
 * Approximates the token count of a string.
 *
 * Heuristic: about four characters per token for English text, measured
 * after whitespace runs are collapsed to one space and the ends are trimmed.
 *
 * @param {string} text - Text to measure.
 * @returns {number} Estimated token count; 0 when the text is empty or missing.
 */
function estimateTokens(text) {
  if (!text) {
    return 0;
  }

  const CHARS_PER_TOKEN = 4;
  // Normalise layout first so indentation and blank lines do not inflate the count.
  const collapsed = text.replace(/\s+/g, ' ');
  const charCount = collapsed.trim().length;
  return Math.ceil(charCount / CHARS_PER_TOKEN);
}
/**
 * Decides whether an element should be left out of text extraction.
 *
 * An element is excluded when any of the following holds, checked in order:
 *   1. it matches one of TEXT_EXTRACTION_CONFIG.NAV_SELECTORS,
 *   2. its computed style has display "none" or visibility "hidden",
 *   3. its role attribute is "navigation", "banner" or "contentinfo".
 *
 * @param {Element} element - The DOM element to inspect.
 * @returns {boolean} True when the element should be excluded.
 */
function shouldExcludeElement(element) {
  const matchesNavSelector = Array.from(TEXT_EXTRACTION_CONFIG.NAV_SELECTORS).some(
    (navSelector) => element.matches(navSelector)
  );
  if (matchesNavSelector) {
    return true;
  }

  const { display, visibility } = window.getComputedStyle(element);
  if (display === 'none' || visibility === 'hidden') {
    return true;
  }

  const excludedRoles = ['navigation', 'banner', 'contentinfo'];
  const role = element.getAttribute('role');
  return Boolean(role) && excludedRoles.includes(role);
}
/**
 * Extracts the text of a DOM subtree with navigation chrome stripped out.
 *
 * Works on a deep clone, so the live DOM is never modified. Removed from the
 * clone before reading its text:
 *   - everything matching TEXT_EXTRACTION_CONFIG.NAV_SELECTORS,
 *   - elements with role "navigation", "banner" or "contentinfo",
 *   - <script> and <style> elements.
 *
 * The result is a single line: every whitespace run (newlines included) is
 * collapsed to one space and the ends are trimmed.
 *
 * @param {Element} rootElement - Root to extract from (defaults to document.body).
 * @returns {string} Cleaned text, or '' when no root element is available.
 */
function extractTextContent(rootElement = document.body) {
  if (!rootElement) {
    console.warn('No root element provided for text extraction');
    return '';
  }

  const clonedElement = rootElement.cloneNode(true);

  // Detach every descendant of the clone that matches the selector (list).
  const removeAll = (selector) => {
    clonedElement.querySelectorAll(selector).forEach((el) => el.remove());
  };

  TEXT_EXTRACTION_CONFIG.NAV_SELECTORS.forEach((selector) => removeAll(selector));
  removeAll('[role="navigation"], [role="banner"], [role="contentinfo"]');
  removeAll('script, style');

  // Collapsing all whitespace already removes every newline, so no separate
  // "empty line" pass is needed (the previous second replace could never match).
  return (clonedElement.textContent || '').replace(/\s+/g, ' ').trim();
}
/**
 * Extracts page text split into labelled sources for reference tracking.
 *
 * Works on a deep clone of the root with navigation selectors, navigation
 * roles and <script>/<style> elements removed. The first entry of
 * TEXT_EXTRACTION_CONFIG.CONTENT_SELECTORS that matches anything in the clone
 * decides which elements become sources; when none matches, the whole clone
 * is treated as a single source. Elements whose cleaned text is 50 characters
 * or shorter are skipped.
 *
 * Each source records its text, the current page URL and hostname, its index
 * among the candidate elements (`position`), and the element's id/class/tag.
 * When an element has no id, `elementId` and `label` both hold the synthetic
 * value `content-section-<index>`.
 *
 * @param {Element} rootElement - Root to extract from (defaults to document.body).
 * @returns {Object} `{ text, sources }` where `text` is every source rendered
 *   as "## label" plus content, joined by "---" separators.
 */
function extractTextContentWithSources(rootElement = document.body) {
  if (!rootElement) {
    console.warn('No root element provided for text extraction');
    return { text: '', sources: [] };
  }

  // Clone so the removals below never touch the live DOM.
  const clonedElement = rootElement.cloneNode(true);
  const removeAll = (selector) => {
    clonedElement.querySelectorAll(selector).forEach((el) => el.remove());
  };
  TEXT_EXTRACTION_CONFIG.NAV_SELECTORS.forEach((selector) => removeAll(selector));
  removeAll('[role="navigation"], [role="banner"], [role="contentinfo"]');
  removeAll('script, style');

  // The first content selector with any match wins; later selectors are ignored.
  let contentElements = [];
  for (const selector of TEXT_EXTRACTION_CONFIG.CONTENT_SELECTORS) {
    const matches = clonedElement.querySelectorAll(selector);
    if (matches.length > 0) {
      contentElements = Array.from(matches);
      break;
    }
  }
  if (contentElements.length === 0) {
    contentElements = [clonedElement];
  }

  const sources = [];
  contentElements.forEach((element, index) => {
    const elementId = element.id || `content-section-${index}`;
    const elementClass = element.className || '';
    const elementTag = element.tagName.toLowerCase();

    // Collapsing whitespace also removes all newlines, so one pass suffices.
    const content = (element.textContent || '').replace(/\s+/g, ' ').trim();
    if (content.length <= 50) {
      return; // Only substantial content becomes a source.
    }

    sources.push({
      sourceId: `source-${index}`,
      // elementId is never empty (see fallback above), so it doubles as the label.
      label: elementId,
      content,
      pageUrl: window.location.href,
      domain: window.location.hostname,
      level: 'page',
      position: index,
      elementId,
      elementClass,
      elementTag,
    });
  });

  const text = sources
    .map((source) => `## ${source.label}\n\n${source.content}`)
    .join('\n\n---\n\n');

  return { text, sources };
}
/**
 * Shrinks text to fit a token budget by sampling whole sentences.
 *
 * Text already within the budget is returned unchanged. Otherwise the text is
 * split on runs of ".", "!" and "?", a target sentence count is derived from
 * the average tokens per sentence, and randomDistributedSampling() picks the
 * sentences. The picked sentences are re-joined with ". " and a trailing ".",
 * so the original terminators are not preserved.
 *
 * When sentence sampling cannot produce anything (the text has no sentences,
 * or no sentence is small enough for the budget, e.g. one very long
 * unpunctuated passage), the text is truncated proportionally instead.
 * Previously the second case returned just ".".
 *
 * @param {string} text - The full text content.
 * @param {number} maxTokens - Maximum number of tokens allowed
 *   (defaults to TEXT_EXTRACTION_CONFIG.MAX_TOKENS).
 * @returns {string} Text reduced to roughly fit the budget.
 */
function sampleTextContent(text, maxTokens = TEXT_EXTRACTION_CONFIG.MAX_TOKENS) {
  const currentTokens = estimateTokens(text);
  if (currentTokens <= maxTokens) {
    return text;
  }

  // Fallback: keep a leading slice whose length is proportional to the budget.
  const truncateToBudget = () => {
    const samplingRatio = maxTokens / currentTokens;
    return text.substring(0, Math.floor(text.length * samplingRatio));
  };

  const sentences = text.split(/[.!?]+/).filter((s) => s.trim().length > 0);
  if (sentences.length === 0) {
    return truncateToBudget();
  }

  // currentTokens > maxTokens here, so the target is always below sentences.length;
  // Math.min is kept as a guard.
  const avgTokensPerSentence = currentTokens / sentences.length;
  const targetSentenceCount = Math.floor(maxTokens / avgTokensPerSentence);
  const actualTargetCount = Math.min(targetSentenceCount, sentences.length);

  const sampledSentences = randomDistributedSampling(sentences, actualTargetCount, maxTokens);
  if (sampledSentences.length === 0) {
    // No sentence fits the budget; a bare "." would discard all content.
    return truncateToBudget();
  }
  return `${sampledSentences.join('. ')}.`;
}
/**
 * Picks a random subset of sentences that fits a count and token budget.
 *
 * Sentences are visited in a shuffled order. A sentence is taken when it
 * still fits the token budget and the target count has not been reached;
 * visiting stops once the target count is met or 95% of the budget is used.
 * The chosen sentences are returned trimmed and in their original document
 * order, so the sampled text keeps the flow of the source.
 *
 * When there are no more sentences than `targetCount`, the input array is
 * returned as-is (untrimmed, budget not applied).
 *
 * Selection is tracked by index. An earlier second "top-up" pass was removed:
 * every sentence skipped in the first pass was skipped for exceeding the
 * remaining budget, which only shrinks, so that pass could add nothing new.
 * Because it compared untrimmed sentences against trimmed picks, it could
 * only append duplicates of sentences already chosen.
 *
 * @param {Array} sentences - Array of sentences.
 * @param {number} targetCount - Target number of sentences to sample.
 * @param {number} maxTokens - Maximum tokens allowed.
 * @returns {Array} Sampled sentences, each at most once.
 */
function randomDistributedSampling(sentences, targetCount, maxTokens) {
  if (sentences.length <= targetCount) {
    return sentences;
  }

  const visitOrder = shuffleArray(Array.from({ length: sentences.length }, (_, i) => i));
  const chosenIndices = [];
  let sampledTokens = 0;

  for (const index of visitOrder) {
    const sentenceTokens = estimateTokens(sentences[index].trim());
    const belowTarget = chosenIndices.length < targetCount;
    if (belowTarget && sampledTokens + sentenceTokens <= maxTokens) {
      chosenIndices.push(index);
      sampledTokens += sentenceTokens;
    }
    // Stop once the target is met or the budget is nearly exhausted.
    if (chosenIndices.length >= targetCount || sampledTokens >= maxTokens * 0.95) {
      break;
    }
  }

  return chosenIndices
    .sort((a, b) => a - b)
    .map((index) => sentences[index].trim());
}
/**
 * Returns a randomly reordered copy of an array (Fisher-Yates shuffle).
 *
 * The input is left untouched. Walking from the last slot down to the second,
 * each slot is swapped with a randomly chosen slot at or before it.
 *
 * @param {Array} array - Array to shuffle.
 * @returns {Array} New array holding the same elements in shuffled order.
 */
function shuffleArray(array) {
  const result = [...array];
  let unshuffled = result.length;

  while (unshuffled > 1) {
    const last = unshuffled - 1;
    const pick = Math.floor(Math.random() * unshuffled);
    const held = result[last];
    result[last] = result[pick];
    result[pick] = held;
    unshuffled -= 1;
  }

  return result;
}
/**
 * Entry point: extracts page text for quiz generation, sampled to the
 * configured token budget (TEXT_EXTRACTION_CONFIG.MAX_TOKENS).
 *
 * Never throws for extraction failures: any error is logged and reported
 * through the `error` field of an otherwise empty result.
 *
 * @param {Element} rootElement - Optional root element to extract from
 *   (defaults to document.body).
 * @returns {Object} `{ text, tokens, sampled, originalTokens }`, plus `error`
 *   (the error message) when extraction failed. `sampled` is true when the
 *   final text has fewer estimated tokens than the full text.
 */
export function extractPageTextForQuiz(rootElement = document.body) {
  // Result shape shared by the "nothing found" and "failed" outcomes.
  const emptyResult = (extra = {}) => ({
    text: '',
    tokens: 0,
    sampled: false,
    originalTokens: 0,
    ...extra,
  });

  let result;
  try {
    const fullText = extractTextContent(rootElement);
    if (fullText) {
      const originalTokens = estimateTokens(fullText);
      const text = sampleTextContent(fullText, TEXT_EXTRACTION_CONFIG.MAX_TOKENS);
      const tokens = estimateTokens(text);
      const sampled = tokens < originalTokens;

      console.log(`Text extraction complete:`, {
        originalTokens,
        finalTokens: tokens,
        sampled,
        textLength: text.length,
      });

      result = { text, tokens, sampled, originalTokens };
    } else {
      console.warn('No text content found on the page');
      result = emptyResult();
    }
  } catch (error) {
    console.error('Error extracting page text:', error);
    result = emptyResult({ error: error.message });
  }
  return result;
}
/**
 * Entry point: extracts page text with source mapping for cumulative quiz
 * generation, sampled to TEXT_EXTRACTION_CONFIG.MAX_TOKENS.
 *
 * `tokens` and `originalTokens` are both estimated from formatted text (with
 * the "## label" headings and "---" separators), so they are directly
 * comparable and `sampled` is true only when the final text is smaller than
 * the extracted text. Previously `tokens` counted bare source content while
 * `originalTokens` counted formatted text, which reported `sampled: true`
 * even when nothing had been trimmed.
 *
 * Never throws for extraction failures: any error is logged and reported
 * through the `error` field of an otherwise empty result.
 *
 * @param {Element} rootElement - Optional root element to extract from
 *   (defaults to document.body).
 * @returns {Object} `{ text, sources, tokens, sampled, originalTokens }`, plus
 *   `error` (the error message) when extraction failed.
 */
export function extractPageTextWithSourcesForQuiz(rootElement = document.body) {
  try {
    const extractionResult = extractTextContentWithSources(rootElement);
    if (!extractionResult.text || extractionResult.sources.length === 0) {
      console.warn('No text content or sources found on the page');
      return {
        text: '',
        sources: [],
        tokens: 0,
        sampled: false,
        originalTokens: 0
      };
    }

    const originalTokens = estimateTokens(extractionResult.text);

    // Source-aware sampling spreads the budget across sources.
    const samplingResult = sampleTextContentWithSources(
      extractionResult.sources,
      TEXT_EXTRACTION_CONFIG.MAX_TOKENS
    );
    const finalText = samplingResult.text;

    // Measure the final text the same way as the original text; the sampler's
    // own `tokens` excludes headings and separators and is not comparable.
    const finalTokens = estimateTokens(finalText);
    const wasSampled = finalTokens < originalTokens;

    console.log(`Text extraction with sources complete:`, {
      originalTokens,
      finalTokens,
      sampled: wasSampled,
      textLength: finalText.length,
      sourceCount: extractionResult.sources.length,
      sampledSourceCount: samplingResult.sampledSources.length
    });

    return {
      text: finalText,
      sources: samplingResult.sampledSources,
      tokens: finalTokens,
      sampled: wasSampled,
      originalTokens: originalTokens
    };
  } catch (error) {
    console.error('Error extracting page text with sources:', error);
    return {
      text: '',
      sources: [],
      tokens: 0,
      sampled: false,
      originalTokens: 0,
      error: error.message
    };
  }
}
/**
 * Samples text from several sources so the total fits a token budget while
 * every source stays represented.
 *
 * When all sources together fit the budget they are returned unchanged.
 * Otherwise the budget is shared out smallest source first: each source may
 * use an equal share of what is still unallocated, and whatever a small
 * source does not need rolls over to the larger ones. (Previously every
 * source was capped at maxTokens / sources.length and leftover budget was
 * never handed to the sources that had been cut, so the result could stay far
 * below the budget.) Sources larger than their share are shrunk with
 * sampleTextContent().
 *
 * Sampled sources are returned largest-first (by original size, ties in input
 * order) with a `tokens` field holding the estimate for the sampled content.
 * In the sampling path, a source that receives no budget, has no content, or
 * whose sampled content would overshoot the remaining budget is omitted.
 *
 * @param {Array} sources - Source objects; `label` and `content` are read.
 * @param {number} maxTokens - Maximum tokens allowed
 *   (defaults to TEXT_EXTRACTION_CONFIG.MAX_TOKENS).
 * @returns {Object} `{ text, tokens, sampledSources }`. `tokens` is the sum of
 *   the content estimates and excludes the "## label" headings and "---"
 *   separators present in `text`.
 */
function sampleTextContentWithSources(sources, maxTokens = TEXT_EXTRACTION_CONFIG.MAX_TOKENS) {
  if (!sources || sources.length === 0) {
    return { text: '', tokens: 0, sampledSources: [] };
  }

  const formatSources = (list) =>
    list.map((source) => `## ${source.label}\n\n${source.content}`).join('\n\n---\n\n');

  const measured = sources.map((source, index) => ({
    index,
    source: { ...source, tokens: estimateTokens(source.content) },
  }));
  const totalTokens = measured.reduce((sum, entry) => sum + entry.source.tokens, 0);
  if (totalTokens <= maxTokens) {
    // Everything fits: no sampling needed.
    return { text: formatSources(sources), tokens: totalTokens, sampledSources: sources };
  }

  // Allocate smallest-first so budget a small source leaves unused flows on
  // to the larger sources instead of being lost.
  const smallestFirst = [...measured].sort((a, b) => a.source.tokens - b.source.tokens);
  const sampledByIndex = new Map();
  let remainingTokens = maxTokens;

  smallestFirst.forEach(({ index, source }, rank) => {
    const sourcesLeft = smallestFirst.length - rank;
    const fairShare = Math.floor(remainingTokens / sourcesLeft);
    const targetTokens = Math.min(source.tokens, fairShare);
    if (targetTokens <= 0) {
      return;
    }
    const sampledContent = sampleTextContent(source.content, targetTokens);
    const actualTokens = estimateTokens(sampledContent);
    // Re-joined sentences can land slightly above the target; never exceed the total.
    if (actualTokens > remainingTokens) {
      return;
    }
    sampledByIndex.set(index, { ...source, content: sampledContent, tokens: actualTokens });
    remainingTokens -= actualTokens;
  });

  // Emit in the established largest-first order.
  const sampledSources = [...measured]
    .sort((a, b) => b.source.tokens - a.source.tokens)
    .filter(({ index }) => sampledByIndex.has(index))
    .map(({ index }) => sampledByIndex.get(index));

  return {
    text: formatSources(sampledSources),
    tokens: maxTokens - remainingTokens,
    sampledSources,
  };
}
/**
 * Reports a failed text extraction on the console.
 *
 * Only results carrying a truthy `error` field are logged; successful results
 * and missing results (null/undefined) are ignored, so this debugging helper
 * never throws.
 *
 * @param {Object} extractionResult - Result from extractPageTextForQuiz or
 *   extractPageTextWithSourcesForQuiz.
 */
export function logTextExtraction(extractionResult) {
  if (extractionResult && extractionResult.error) {
    console.error('Extraction Error:', extractionResult.error);
  }
}
/**
 * Returns the address of the page currently loaded in the window.
 *
 * @returns {string} The value of window.location.href.
 */
export function getCurrentPageUrl() {
  const { href } = window.location;
  return href;
}
/**
 * Returns the title of the current document.
 *
 * @returns {string} document.title, or 'Untitled Page' when the title is empty.
 */
export function getCurrentPageTitle() {
  const { title } = document;
  return title ? title : 'Untitled Page';
}