// cs249r_book/socratiq/src_shadow/js/utils/textExtractor.js

// textExtractor.js
import { TEXT_EXTRACTION_CONFIG } from '../../configs/client.config.js';

/**
 * Approximates how many tokens a string would occupy.
 * Uses the rule of thumb that one token is about four characters of
 * English text, measured after whitespace has been normalized.
 * @param {string} text - Text to measure
 * @returns {number} - Approximate token count (0 for empty or missing text)
 */
function estimateTokens(text) {
  if (!text) {
    return 0;
  }

  // Splitting on whitespace runs and re-joining with single spaces both
  // squeezes inner runs and drops any leading/trailing whitespace.
  const normalized = text.split(/\s+/).filter(Boolean).join(' ');
  const charsPerToken = 4;

  return Math.ceil(normalized.length / charsPerToken);
}

/**
 * Decides whether a DOM element should be left out of text extraction.
 * An element is excluded when it matches one of the configured navigation
 * selectors, when its computed style hides it, or when it carries a
 * navigation-like ARIA role.
 * @param {Element} element - The DOM element to inspect
 * @returns {boolean} - True when the element should be skipped
 */
function shouldExcludeElement(element) {
  // 1. Configured navigation selectors
  const matchesNavSelector = TEXT_EXTRACTION_CONFIG.NAV_SELECTORS.some(
    (selector) => element.matches(selector)
  );
  if (matchesNavSelector) {
    return true;
  }

  // 2. Elements hidden through CSS
  const { display, visibility } = window.getComputedStyle(element);
  if (display === 'none' || visibility === 'hidden') {
    return true;
  }

  // 3. Landmark roles that denote page chrome rather than content
  const excludedRoles = ['navigation', 'banner', 'contentinfo'];
  const role = element.getAttribute('role');
  return Boolean(role) && excludedRoles.includes(role);
}

/**
 * Extracts the text of a DOM subtree with navigation chrome stripped out.
 *
 * The subtree is deep-cloned first so the live DOM is never modified. From
 * the clone this removes, in order: every match of the configured
 * navigation selectors, elements with a navigation/banner/contentinfo role,
 * and script/style elements. The remaining `textContent` is returned with
 * all whitespace runs (including line breaks) collapsed to single spaces.
 *
 * @param {Element} rootElement - The root element to extract text from (defaults to document.body)
 * @returns {string} - Extracted single-line text, or '' when no root element is available
 */
function extractTextContent(rootElement = document.body) {
  if (!rootElement) {
    console.warn('No root element provided for text extraction');
    return '';
  }

  // Work on a copy so removals below never touch the page itself
  const clonedElement = rootElement.cloneNode(true);

  const removalSelectors = [
    ...TEXT_EXTRACTION_CONFIG.NAV_SELECTORS,
    '[role="navigation"], [role="banner"], [role="contentinfo"]',
    'script, style',
  ];
  for (const selector of removalSelectors) {
    clonedElement.querySelectorAll(selector).forEach((el) => el.remove());
  }

  // Collapsing every whitespace run already removes all line breaks, so no
  // separate "empty line" clean-up is needed afterwards.
  return (clonedElement.textContent || '').replace(/\s+/g, ' ').trim();
}

/**
 * Extracts text from a DOM subtree and records which element each piece of
 * text came from, so later references can be traced back to a section.
 *
 * The subtree is deep-cloned and stripped of navigation selectors,
 * navigation-like roles and script/style elements. Content elements are the
 * matches of the first entry in `TEXT_EXTRACTION_CONFIG.CONTENT_SELECTORS`
 * that matches anything; if none match, the whole clone is used as a single
 * section. Sections whose cleaned text is 50 characters or shorter are
 * skipped.
 *
 * Each source has: `sourceId` (`source-<index>`), `label` and `elementId`
 * (the element id, or `content-section-<index>` when it has none),
 * `content`, `pageUrl`, `domain`, `level` (always 'page'), `position`
 * (index among the content elements), `elementClass` and `elementTag`.
 *
 * @param {Element} rootElement - The root element to extract text from (defaults to document.body)
 * @returns {Object} - `{ text, sources }`; `text` joins the sections as
 *   `## <label>` blocks separated by `---`
 */
function extractTextContentWithSources(rootElement = document.body) {
  if (!rootElement) {
    console.warn('No root element provided for text extraction');
    return { text: '', sources: [] };
  }

  // Work on a copy so removals below never touch the page itself
  const clonedElement = rootElement.cloneNode(true);

  const removalSelectors = [
    ...TEXT_EXTRACTION_CONFIG.NAV_SELECTORS,
    '[role="navigation"], [role="banner"], [role="contentinfo"]',
    'script, style',
  ];
  for (const selector of removalSelectors) {
    clonedElement.querySelectorAll(selector).forEach((el) => el.remove());
  }

  // The first content selector that matches anything wins; otherwise the
  // whole clone is treated as one section.
  let contentElements = [clonedElement];
  for (const selector of TEXT_EXTRACTION_CONFIG.CONTENT_SELECTORS) {
    const matches = clonedElement.querySelectorAll(selector);
    if (matches.length > 0) {
      contentElements = Array.from(matches);
      break;
    }
  }

  const minSectionLength = 50; // shorter sections are not considered substantial
  const sources = [];

  contentElements.forEach((element, index) => {
    // Collapsing every whitespace run also removes all line breaks
    const content = (element.textContent || '').replace(/\s+/g, ' ').trim();
    if (content.length <= minSectionLength) {
      return;
    }

    // Always non-empty, so it doubles as the human-readable label
    const elementId = element.id || `content-section-${index}`;

    sources.push({
      sourceId: `source-${index}`,
      label: elementId,
      content,
      pageUrl: window.location.href,
      domain: window.location.hostname,
      level: 'page',
      position: index,
      elementId,
      elementClass: element.className || '',
      elementTag: element.tagName.toLowerCase(),
    });
  });

  const text = sources
    .map((source) => `## ${source.label}\n\n${source.content}`)
    .join('\n\n---\n\n');

  return { text, sources };
}

/**
 * Reduces text to roughly fit a token budget.
 *
 * Text already within budget is returned unchanged. Otherwise the text is
 * split on sentence-ending punctuation (`.`, `!`, `?`), a subset of the
 * sentences is chosen by `randomDistributedSampling`, and the chosen
 * sentences are re-joined with `'. '` plus a final `'.'` (the original
 * punctuation is not preserved).
 *
 * When sentence sampling cannot produce anything, either because the text
 * has no sentence content or because no sentence was selected (for example
 * one long unpunctuated block), the text is truncated proportionally from
 * the start instead of returning a bare `'.'`.
 *
 * @param {string} text - The full text content
 * @param {number} maxTokens - Maximum number of tokens allowed
 * @returns {string} - Sampled (or truncated) text content
 */
function sampleTextContent(text, maxTokens = TEXT_EXTRACTION_CONFIG.MAX_TOKENS) {
  const currentTokens = estimateTokens(text);

  if (currentTokens <= maxTokens) {
    return text;
  }

  // Keep the leading share of the text that corresponds to the budget.
  // Multiplying before dividing keeps the character count exact.
  const truncateToBudget = () =>
    text.substring(0, Math.floor((text.length * maxTokens) / currentTokens));

  // Split text into sentences for better sampling
  const sentences = text.split(/[.!?]+/).filter((s) => s.trim().length > 0);
  if (sentences.length === 0) {
    return truncateToBudget();
  }

  // Estimate how many average-sized sentences fit, capped at what exists
  const avgTokensPerSentence = currentTokens / sentences.length;
  const targetSentenceCount = Math.min(
    Math.floor(maxTokens / avgTokensPerSentence),
    sentences.length
  );

  const sampledSentences = randomDistributedSampling(sentences, targetSentenceCount, maxTokens);

  // Nothing selected (e.g. every sentence is larger than the budget):
  // fall back to truncation rather than emitting a lone '.'
  if (sampledSentences.length === 0) {
    return truncateToBudget();
  }

  return sampledSentences.join('. ') + '.';
}

/**
 * Picks a random subset of sentences that fits a sentence-count target and
 * a token budget, and returns them in their original document order.
 *
 * Sentences are visited in a random order (via `shuffleArray`). A sentence
 * is taken when it still fits in the remaining token budget. Visiting stops
 * once `targetCount` sentences are taken or 95% of the budget is used.
 * Every sentence is visited at most once, so a sentence can never be
 * returned twice.
 *
 * When the input already has `targetCount` sentences or fewer, it is
 * returned unchanged (same array, sentences not trimmed).
 *
 * @param {Array} sentences - Array of sentences
 * @param {number} targetCount - Target number of sentences to sample
 * @param {number} maxTokens - Maximum tokens allowed
 * @returns {Array} - Sampled sentences, trimmed, in document order
 */
function randomDistributedSampling(sentences, targetCount, maxTokens) {
  if (sentences.length <= targetCount) {
    return sentences;
  }

  // Randomize which sentences are considered first
  const visitOrder = shuffleArray(Array.from({ length: sentences.length }, (_, i) => i));

  const chosenIndices = [];
  let sampledTokens = 0;

  for (const index of visitOrder) {
    // Stop if we've reached our target or are close to the token limit
    if (chosenIndices.length >= targetCount || sampledTokens >= maxTokens * 0.95) {
      break;
    }

    const sentenceTokens = estimateTokens(sentences[index].trim());
    if (sampledTokens + sentenceTokens <= maxTokens) {
      chosenIndices.push(index);
      sampledTokens += sentenceTokens;
    }
  }

  // Selection is random, but the output keeps the reading order of the text
  return chosenIndices
    .sort((a, b) => a - b)
    .map((index) => sentences[index].trim());
}

/**
 * Returns a randomly reordered copy of an array (Fisher-Yates shuffle).
 * The input array is left untouched.
 * @param {Array} array - Array to shuffle
 * @returns {Array} - New array holding the same items in random order
 */
function shuffleArray(array) {
  const result = [...array];
  let last = result.length - 1;

  // Walk from the end, swapping each slot with a random slot at or before it
  while (last > 0) {
    const pick = Math.floor(Math.random() * (last + 1));
    const held = result[last];
    result[last] = result[pick];
    result[pick] = held;
    last -= 1;
  }

  return result;
}

/**
 * Entry point for quiz generation: pulls the page text and trims it to the
 * configured token budget.
 *
 * Any error raised while extracting or sampling is logged and reported
 * through an `error` field on an otherwise empty result.
 *
 * @param {Element} rootElement - Optional root element to extract from
 * @returns {Object} - `{ text, tokens, sampled, originalTokens }`, plus
 *   `error` (message string) when extraction failed
 */
export function extractPageTextForQuiz(rootElement = document.body) {
  // Result used both when the page has no text and as the base of a failure
  const emptyResult = () => ({
    text: '',
    tokens: 0,
    sampled: false,
    originalTokens: 0
  });

  try {
    const fullText = extractTextContent(rootElement);

    if (!fullText) {
      console.warn('No text content found on the page');
      return emptyResult();
    }

    // Measure before and after sampling; fewer tokens afterwards means
    // content was dropped
    const originalTokens = estimateTokens(fullText);
    const finalText = sampleTextContent(fullText, TEXT_EXTRACTION_CONFIG.MAX_TOKENS);
    const finalTokens = estimateTokens(finalText);
    const wasSampled = finalTokens < originalTokens;

    console.log(`Text extraction complete:`, {
      originalTokens,
      finalTokens,
      sampled: wasSampled,
      textLength: finalText.length
    });

    return {
      text: finalText,
      tokens: finalTokens,
      sampled: wasSampled,
      originalTokens
    };
  } catch (error) {
    console.error('Error extracting page text:', error);
    return { ...emptyResult(), error: error.message };
  }
}

/**
 * Entry point for cumulative quiz generation: pulls the page text together
 * with its source mapping and trims it to the configured token budget.
 *
 * `originalTokens` and `tokens` are both estimated from the formatted text
 * (section headers and separators included), so they are directly
 * comparable and `sampled` is true only when the returned text is actually
 * smaller than the full extraction.
 *
 * Any error raised while extracting or sampling is logged and reported
 * through an `error` field on an otherwise empty result.
 *
 * @param {Element} rootElement - Optional root element to extract from
 * @returns {Object} - `{ text, sources, tokens, sampled, originalTokens }`,
 *   plus `error` (message string) when extraction failed
 */
export function extractPageTextWithSourcesForQuiz(rootElement = document.body) {
  try {
    // Extract text content with source mapping
    const extractionResult = extractTextContentWithSources(rootElement);

    if (!extractionResult.text || extractionResult.sources.length === 0) {
      console.warn('No text content or sources found on the page');
      return {
        text: '',
        sources: [],
        tokens: 0,
        sampled: false,
        originalTokens: 0
      };
    }

    const originalTokens = estimateTokens(extractionResult.text);

    // Use source-aware sampling for better distribution
    const samplingResult = sampleTextContentWithSources(
      extractionResult.sources,
      TEXT_EXTRACTION_CONFIG.MAX_TOKENS
    );
    const finalText = samplingResult.text;

    // Measure the final text the same way as the original. The sampler's own
    // `tokens` counts section content only, which would make an untouched
    // page look sampled because its headers are not counted.
    const finalTokens = estimateTokens(finalText);
    const wasSampled = finalTokens < originalTokens;

    console.log(`Text extraction with sources complete:`, {
      originalTokens,
      finalTokens,
      sampled: wasSampled,
      textLength: finalText.length,
      sourceCount: extractionResult.sources.length,
      sampledSourceCount: samplingResult.sampledSources.length
    });

    return {
      text: finalText,
      sources: samplingResult.sampledSources,
      tokens: finalTokens,
      sampled: wasSampled,
      originalTokens: originalTokens
    };

  } catch (error) {
    console.error('Error extracting page text with sources:', error);
    return {
      text: '',
      sources: [],
      tokens: 0,
      sampled: false,
      originalTokens: 0,
      error: error.message
    };
  }
}

/**
 * Samples text from several sources so that each one is represented within
 * a shared token budget.
 *
 * If the sources already fit, they are returned unchanged. Otherwise each
 * source is given an equal share of the budget (`maxTokens / sources.length`)
 * and is reduced to that share with `sampleTextContent`. Sources are
 * processed largest-first; a source whose sampled text would push the total
 * over the budget is skipped, and skipped sources get one more attempt with
 * whatever budget is left if less than 80% has been used.
 *
 * The returned sources and combined text are always in the same order as
 * the input, regardless of processing order. Sampled sources are copies of
 * the input objects with `content` replaced and a `tokens` estimate added.
 * The returned `tokens` is the sum of content estimates only; the `## label`
 * headers and separators in `text` are not counted.
 *
 * NOTE(review): a source capped at its equal share is never topped up when
 * smaller sources leave budget unused; only skipped sources are retried.
 *
 * @param {Array} sources - Array of source objects with `label` and `content`
 * @param {number} maxTokens - Maximum tokens allowed
 * @returns {Object} - `{ text, tokens, sampledSources }`
 */
function sampleTextContentWithSources(sources, maxTokens = TEXT_EXTRACTION_CONFIG.MAX_TOKENS) {
  if (!sources || sources.length === 0) {
    return { text: '', tokens: 0, sampledSources: [] };
  }

  const formatSources = (list) =>
    list.map((source) => `## ${source.label}\n\n${source.content}`).join('\n\n---\n\n');

  // Remember each source's input position so output order can be restored
  const candidates = sources.map((source, order) => ({
    order,
    source,
    tokens: estimateTokens(source.content)
  }));
  const totalTokens = candidates.reduce((sum, candidate) => sum + candidate.tokens, 0);

  if (totalTokens <= maxTokens) {
    // No sampling needed
    return {
      text: formatSources(sources),
      tokens: totalTokens,
      sampledSources: sources
    };
  }

  const tokensPerSource = Math.floor(maxTokens / sources.length);
  const minTokensPerSource = Math.floor(tokensPerSource * 0.5); // retry threshold

  const picked = [];
  const pickedOrders = new Set();
  let sampledTokens = 0;

  // Samples one candidate down to `budget` and keeps it if the running
  // total stays within maxTokens.
  const trySample = (candidate, budget) => {
    const sampledContent = sampleTextContent(candidate.source.content, budget);
    const actualTokens = estimateTokens(sampledContent);
    if (sampledTokens + actualTokens > maxTokens) {
      return;
    }
    picked.push({
      order: candidate.order,
      entry: { ...candidate.source, content: sampledContent, tokens: actualTokens }
    });
    pickedOrders.add(candidate.order);
    sampledTokens += actualTokens;
  };

  // Process largest sources first
  const largestFirst = [...candidates].sort((a, b) => b.tokens - a.tokens);

  for (const candidate of largestFirst) {
    const targetTokens = Math.min(candidate.tokens, tokensPerSource);
    if (sampledTokens + targetTokens <= maxTokens) {
      trySample(candidate, targetTokens);
    }

    // Stop if we're close to the limit
    if (sampledTokens >= maxTokens * 0.95) {
      break;
    }
  }

  // If there is still room, retry the sources that were skipped above
  if (sampledTokens < maxTokens * 0.8) {
    for (const candidate of largestFirst) {
      if (pickedOrders.has(candidate.order)) {
        continue;
      }
      const remainingTokens = maxTokens - sampledTokens;
      if (remainingTokens > minTokensPerSource) {
        trySample(candidate, remainingTokens);
      }
    }
  }

  // Present the result in input order, matching the no-sampling path
  const sampledSources = picked
    .sort((a, b) => a.order - b.order)
    .map((item) => item.entry);

  return {
    text: formatSources(sampledSources),
    tokens: sampledTokens,
    sampledSources: sampledSources
  };
}

/**
 * Reports a failed text extraction on the console.
 * Only results carrying an `error` value produce output; successful
 * results, and a missing result object, are ignored silently.
 * @param {Object} extractionResult - Result from extractPageTextForQuiz or extractPageTextWithSourcesForQuiz
 */
export function logTextExtraction(extractionResult) {
  // Guard against a missing result so a logging helper can never throw
  if (extractionResult && extractionResult.error) {
    console.error('Extraction Error:', extractionResult.error);
  }
}

/**
 * Returns the address of the page currently loaded in the window.
 * @returns {string} - Value of `window.location.href`
 */
export function getCurrentPageUrl() {
  const { href } = window.location;
  return href;
}

/**
 * Returns the title of the current document.
 * @returns {string} - `document.title`, or 'Untitled Page' when it is empty
 */
export function getCurrentPageTitle() {
  const { title } = document;
  if (title) {
    return title;
  }
  return 'Untitled Page';
}