mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-08 02:28:25 -05:00
59 lines
2.2 KiB
JavaScript
59 lines
2.2 KiB
JavaScript
|
|
// Define the CustomTextSplitter class
|
|
/**
 * Splits text into chunks of space-joined tokens, optionally repeating the
 * last few tokens of each chunk at the start of the next one.
 *
 * The text is first cut on `charToSplit`; each piece is then tokenized into
 * runs of word characters (`\w+`) and runs of non-word, non-space characters
 * (punctuation). Whitespace is discarded and tokens are re-joined with a
 * single space, so the output is a normalized form of the input.
 */
export class CustomTextSplitter {
  /**
   * @param {number} interval - Size budget of a chunk. Every token costs its
   *   length plus one (for the separating space), so a joined chunk is at most
   *   `interval - 1` characters long unless a single token or the carried-over
   *   overlap is itself larger than the budget.
   * @param {number} overlap - Number of trailing tokens of a finished chunk
   *   that are repeated at the start of the next chunk. Zero, negative or
   *   non-numeric values mean "no overlap".
   * @param {string} [charToSplit='\n\n'] - Separator used for the initial
   *   split of the text.
   */
  constructor(interval, overlap, charToSplit = '\n\n') {
    this.interval = interval;
    this.overlap = overlap;
    this.charToSplit = charToSplit;
  }

  /**
   * Break `text` into chunks according to the configured interval and overlap.
   *
   * @param {string} text - Text to split.
   * @returns {string[]} Chunks in input order; empty when `text` has no tokens.
   */
  split(text) {
    // Built once per call; String.prototype.match resets lastIndex for /g regexes.
    const tokenPattern = /\w+|[^\w\s]+/g;
    const result = [];

    let currentChunk = [];
    let currentSize = 0;

    for (const line of text.split(this.charToSplit)) {
      const tokens = line.match(tokenPattern) || [];

      for (const token of tokens) {
        // One extra character accounts for the space that follows the token.
        const cost = token.length + 1;

        // Flush only a non-empty chunk: an oversized first token must not
        // produce an empty-string chunk.
        if (currentChunk.length > 0 && currentSize + cost > this.interval) {
          result.push(currentChunk.join(' '));

          // slice(-0) is slice(0) and would copy the whole chunk, so a zero
          // (or otherwise non-positive) overlap is handled explicitly.
          currentChunk = this.overlap > 0 ? currentChunk.slice(-this.overlap) : [];
          currentSize = currentChunk.reduce((sum, kept) => sum + kept.length + 1, 0);
        }

        currentChunk.push(token);
        currentSize += cost;
      }
    }

    // Emit whatever is left after the last token.
    if (currentChunk.length > 0) {
      result.push(currentChunk.join(' '));
    }

    return result;
  }
}
|
|
|
|
// // Example text to split
|
|
// const text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur tincidunt magna ut justo gravida, sed gravida nisi euismod. Vivamus nec dictum libero.";
|
|
|
|
// // Instantiate the text splitter with specific interval and overlap values
|
|
// const intervalSize = 50; // Example interval size, can be adjusted to your needs
|
|
// const overlapSize = 5; // Example overlap size, adjust as needed
|
|
|
|
// const textSplitter = new CustomTextSplitter(intervalSize, overlapSize);
|
|
|
|
// // Split the text and print each chunk
|
|
// const chunks = textSplitter.split(text);
|
|
// console.log("Chunks of text:");
|
|
// chunks.forEach((chunk, index) => {
|
|
// console.log(`Chunk ${index + 1}: ${chunk}`);
|
|
// });
|