# Provenance (residue of the Hugging Face Hub file page this was copied from):
# uploaded by yuvis with huggingface_hub ("Upload folder using huggingface_hub"),
# commit f4c70c8 (verified), file size 657 bytes.
from typing import List
class Chunker:
    """Base interface for objects that split a text into a list of string chunks.

    Subclasses override :meth:`chunk`; calling it on this base class raises
    ``NotImplementedError``.
    """

    def chunk(self, text: str) -> List[str]:
        """Split ``text`` into chunks.

        Args:
            text: The text to split.

        Returns:
            The chunks as a list of strings (defined by the subclass).

        Raises:
            NotImplementedError: Always, on the base class.
        """
        raise NotImplementedError("subclasses must implement chunk()")
class SlidingWindowChunker(Chunker):
    """Split text into overlapping windows of whitespace-separated words.

    Each chunk holds up to ``chunk_size`` words joined by single spaces, and
    consecutive chunks start ``chunk_size - overlap`` words apart, so they
    share ``overlap`` words. Sizes are counted in words (``str.split()``
    tokens), not characters.
    """

    def __init__(self, chunk_size: int = 512, overlap: int = 50):
        """Configure the window.

        Args:
            chunk_size: Maximum number of words per chunk; must be positive.
            overlap: Number of words shared by consecutive chunks; must be
                smaller than ``chunk_size``. A negative value is accepted and
                leaves a gap of skipped words between chunks.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``overlap`` is
                not smaller than ``chunk_size``.
        """
        self._check_window(chunk_size, overlap)
        self.chunk_size = chunk_size
        self.overlap = overlap

    @staticmethod
    def _check_window(chunk_size: int, overlap: int) -> None:
        """Raise ``ValueError`` unless the window can advance through the text."""
        if chunk_size <= 0:
            raise ValueError(
                "chunk_size must be a positive integer, got %r" % (chunk_size,)
            )
        if overlap >= chunk_size:
            # A stride of zero makes range() raise an unrelated-looking error,
            # and a negative stride makes it empty, silently dropping all text.
            raise ValueError(
                "overlap (%r) must be smaller than chunk_size (%r)"
                % (overlap, chunk_size)
            )

    def chunk(self, text: str) -> List[str]:
        """Split ``text`` into overlapping word windows.

        Args:
            text: The text to split; words are separated on any whitespace.

        Returns:
            The chunks in order, each a space-joined run of words. Empty or
            whitespace-only text yields an empty list.

        Raises:
            ValueError: If ``chunk_size``/``overlap`` were changed to an
                invalid combination after construction.
        """
        # The attributes are public, so re-check them at the point of use.
        self._check_window(self.chunk_size, self.overlap)

        words = text.split()
        stride = self.chunk_size - self.overlap
        chunks = []
        for start in range(0, len(words), stride):
            end = start + self.chunk_size
            chunks.append(" ".join(words[start:end]))
            # Stop once a window reaches the end of the text; any later window
            # would only repeat words already covered by this one.
            if end >= len(words):
                break
        return chunks