import json
from datetime import datetime

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document


def process_transcript_to_documents(
    transcript_text,
    speaker_data,
    meeting_id,
    meeting_metadata=None,
    min_chunk_size=1500,  # Increased from 500 for better RAG context
    max_chunk_size=3000,  # Increased from 1500 for richer chunks
    chunk_overlap=200,    # Increased from 100 for better continuity
):
    """
    Process transcript text and speaker data into LangChain Documents with semantic grouping.

    Groups consecutive speaker segments into meaningful chunks with rich metadata for better RAG.

    Args:
        transcript_text (str): The full transcript text.
        speaker_data (list): List of dictionaries containing segment info (text, start, end, speaker).
        meeting_id (str): Unique identifier for the meeting.
        meeting_metadata (dict, optional): Additional metadata (meeting_date, source_file, etc.).
        min_chunk_size (int): Minimum characters per chunk (default: 1500).
        max_chunk_size (int): Maximum characters per chunk (default: 3000).
        chunk_overlap (int): Character overlap between chunks (default: 200).

    Returns:
        list[Document]: List of processed LangChain Documents with rich metadata.
    """
    if not speaker_data:
        # Fallback: use RecursiveCharacterTextSplitter on raw text
        return _fallback_chunking(
            transcript_text, meeting_id, meeting_metadata,
            min_chunk_size, max_chunk_size, chunk_overlap,
        )

    # Initialize metadata defaults
    meeting_metadata = meeting_metadata or {}

    # Group segments into meaningful chunks
    chunks = []
    current_chunk = {
        "text": "",
        "speaker": None,
        "speakers": set(),
        "start_time": None,
        "end_time": None,
        "segment_count": 0,
    }

    def finalize_chunk():
        """Finalize the current chunk and add it to the chunks list."""
        if current_chunk["text"].strip():
            chunks.append({
                "text": current_chunk["text"].strip(),
                "speaker": current_chunk["speaker"],
                "speakers": list(current_chunk["speakers"]),
                "start_time": current_chunk["start_time"],
                "end_time": current_chunk["end_time"],
                "segment_count": current_chunk["segment_count"],
            })
        # Reset current chunk
        current_chunk["text"] = ""
        current_chunk["speaker"] = None
        current_chunk["speakers"] = set()
        current_chunk["start_time"] = None
        current_chunk["end_time"] = None
        current_chunk["segment_count"] = 0

    # Process segments with semantic grouping
    for segment in speaker_data:
        text = segment.get("text", "").strip()
        if not text:
            continue

        speaker = segment.get("speaker", "UNKNOWN")
        start = segment.get("start", 0)
        end = segment.get("end", 0)

        # Initialize chunk if empty
        if current_chunk["speaker"] is None:
            current_chunk["speaker"] = speaker
            current_chunk["start_time"] = start

        # Check if we should finalize the current chunk
        current_length = len(current_chunk["text"])
        new_length = current_length + len(text) + 1  # +1 for space

        should_finalize = False
        # Finalize if we exceed max_chunk_size
        if new_length > max_chunk_size and current_length >= min_chunk_size:
            should_finalize = True
        # Finalize if speaker changes AND we've met min_chunk_size
        elif speaker != current_chunk["speaker"] and current_length >= min_chunk_size:
            should_finalize = True

        if should_finalize:
            finalize_chunk()
            # Start new chunk with current segment
            current_chunk["speaker"] = speaker
            current_chunk["start_time"] = start

        # Add segment to current chunk
        if current_chunk["text"]:
            current_chunk["text"] += " " + text
        else:
            current_chunk["text"] = text

        current_chunk["speakers"].add(speaker)
        current_chunk["end_time"] = end
        current_chunk["segment_count"] += 1

    # Finalize the last chunk
    finalize_chunk()
    # Apply overlap between chunks
    chunks_with_overlap = _apply_overlap(chunks, chunk_overlap)

    # Convert chunks to LangChain Documents with rich metadata
    documents = []
    total_chunks = len(chunks_with_overlap)

    for idx, chunk in enumerate(chunks_with_overlap):
        # Build comprehensive metadata with all available fields.
        # Note: Pinecone only accepts string/number/boolean/list metadata, so dicts are converted to JSON strings.
        speaker_mapping = meeting_metadata.get("speaker_mapping", {})
        speaker_mapping_json = json.dumps(speaker_mapping) if speaker_mapping else "{}"

        metadata = {
            # Meeting identification
            "meeting_id": meeting_id,
            "meeting_date": meeting_metadata.get("meeting_date", datetime.now().strftime("%Y-%m-%d")),
            "meeting_title": meeting_metadata.get("meeting_title", ""),
            "summary": meeting_metadata.get("summary", ""),

            # Temporal information
            "start_time": chunk["start_time"],
            "end_time": chunk["end_time"],
            "duration": chunk["end_time"] - chunk["start_time"],
            "start_time_formatted": _format_timestamp(chunk["start_time"]),
            "end_time_formatted": _format_timestamp(chunk["end_time"]),
            "meeting_duration": meeting_metadata.get("duration", "N/A"),  # total meeting duration

            # Speaker information
            "speaker": chunk["speaker"],
            "speakers": chunk["speakers"],
            "speaker_count": len(chunk["speakers"]),
            "speaker_mapping": speaker_mapping_json,  # JSON string for Pinecone compatibility

            # Content metadata
            "chunk_type": "conversation_turn" if len(chunk["speakers"]) == 1 else "mixed_speakers",
            "chunk_index": idx,
            "total_chunks": total_chunks,
            "word_count": len(chunk["text"].split()),
            "char_count": len(chunk["text"]),
            "segment_count": chunk["segment_count"],

            # Source information
            "source": meeting_metadata.get("source", "unknown"),
            "source_file": meeting_metadata.get("source_file", ""),
            "transcription_model": meeting_metadata.get("transcription_model", "whisperx"),
            "language": meeting_metadata.get("language", "en"),
            "date_transcribed": meeting_metadata.get("date_transcribed", datetime.now().strftime("%Y-%m-%d")),
        }

        doc = Document(page_content=chunk["text"], metadata=metadata)
        documents.append(doc)

    return documents


def _apply_overlap(chunks, overlap_size):
    """
    Apply overlap between consecutive chunks by prepending trailing text from the previous chunk.

    Args:
        chunks (list): List of chunk dictionaries.
        overlap_size (int): Number of characters to overlap.

    Returns:
        list: Chunks with overlap applied.
    """
    if overlap_size <= 0 or len(chunks) <= 1:
        return chunks

    overlapped_chunks = [chunks[0]]  # First chunk has no overlap

    for i in range(1, len(chunks)):
        current = chunks[i].copy()
        previous = chunks[i - 1]

        # Get overlap text from the end of the previous chunk
        overlap_text = previous["text"][-overlap_size:].strip()

        # Prepend overlap to the current chunk. start_time is left unchanged: the
        # prepended text is context only, so the temporal metadata stays accurate.
        if overlap_text:
            current["text"] = overlap_text + " " + current["text"]

        overlapped_chunks.append(current)

    return overlapped_chunks


def _fallback_chunking(transcript_text, meeting_id, meeting_metadata,
                       min_chunk_size, max_chunk_size, chunk_overlap):
    """
    Fallback chunking when no speaker data is available.
    Uses RecursiveCharacterTextSplitter on the raw transcript.

    Args:
        transcript_text (str): Full transcript text.
        meeting_id (str): Meeting identifier.
        meeting_metadata (dict): Meeting metadata.
        min_chunk_size (int): Minimum chunk size (kept for interface parity; not used by the splitter).
        max_chunk_size (int): Maximum chunk size.
        chunk_overlap (int): Overlap size.

    Returns:
        list[Document]: Chunked documents.
    """
    meeting_metadata = meeting_metadata or {}

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=max_chunk_size,
        chunk_overlap=chunk_overlap,
    )

    # Create comprehensive base metadata with consistent field names.
    # Note: Pinecone only accepts string/number/boolean/list metadata, so dicts are converted to JSON strings.
    speaker_mapping = meeting_metadata.get("speaker_mapping", {})
    speaker_mapping_json = json.dumps(speaker_mapping) if speaker_mapping else "{}"

    base_metadata = {
        "meeting_id": meeting_id,
        "meeting_date": meeting_metadata.get("meeting_date", datetime.now().strftime("%Y-%m-%d")),
        "meeting_title": meeting_metadata.get("meeting_title", ""),
        "summary": meeting_metadata.get("summary", ""),
        "chunk_type": "full_transcript_chunk",
        "source": meeting_metadata.get("source", "unknown"),
        "source_file": meeting_metadata.get("source_file", ""),
        "transcription_model": meeting_metadata.get("transcription_model", "whisperx"),
        "language": meeting_metadata.get("language", "en"),
        "date_transcribed": meeting_metadata.get("date_transcribed", datetime.now().strftime("%Y-%m-%d")),
        "speaker_mapping": speaker_mapping_json,  # JSON string for Pinecone compatibility
        "meeting_duration": meeting_metadata.get("duration", "N/A"),
    }

    # Split text into chunks
    texts = text_splitter.split_text(transcript_text)

    # Create documents with per-chunk metadata
    documents = []
    total_chunks = len(texts)

    for idx, text in enumerate(texts):
        metadata = base_metadata.copy()
        metadata.update({
            "chunk_index": idx,
            "total_chunks": total_chunks,
            "word_count": len(text.split()),
            "char_count": len(text),
        })

        doc = Document(page_content=text, metadata=metadata)
        documents.append(doc)

    return documents


def _format_timestamp(seconds):
    """
    Convert seconds to MM:SS format.

    Args:
        seconds (float): Time in seconds.

    Returns:
        str: Formatted timestamp (MM:SS).
    """
    if seconds is None:
        return "00:00"

    minutes = int(seconds // 60)
    secs = int(seconds % 60)
    return f"{minutes:02d}:{secs:02d}"
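

# Illustrative usage sketch: the segment dicts, speaker labels, and metadata values
# below are hypothetical sample data in the WhisperX-style shape this module expects
# ("text", "start", "end", "speaker"), not output from any real pipeline run.
if __name__ == "__main__":
    sample_segments = [
        {"text": "Welcome everyone, let's get started.", "start": 0.0, "end": 4.2, "speaker": "SPEAKER_00"},
        {"text": "Thanks. First item on the agenda is the roadmap.", "start": 4.5, "end": 9.1, "speaker": "SPEAKER_01"},
    ]
    sample_metadata = {
        "meeting_title": "Weekly Sync",   # hypothetical
        "source": "upload",               # hypothetical
        "source_file": "weekly_sync.mp4", # hypothetical
        "speaker_mapping": {"SPEAKER_00": "Alice", "SPEAKER_01": "Bob"},  # hypothetical
    }

    docs = process_transcript_to_documents(
        transcript_text=" ".join(seg["text"] for seg in sample_segments),
        speaker_data=sample_segments,
        meeting_id="meeting-001",         # hypothetical
        meeting_metadata=sample_metadata,
    )
    for doc in docs:
        print(doc.metadata["chunk_index"], doc.metadata["speaker"], doc.page_content[:60])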