import re
import unittest

from langchain_core.messages import HumanMessage

from cuga.backend.llm.models import LLMManager
from cuga.config import settings


class TestLongOutput(unittest.IsolatedAsyncioTestCase):
    """
    Test class for verifying that the LLM can generate long outputs (at least 1600 tokens).
    This tests that max_tokens is properly set and working, using LLMManager directly.
    """

    async def test_long_llm_output(self):
        """Test that the LLM can generate outputs of at least 1600 tokens."""
        # Get the model configuration
        model_config = settings.agent.code.model.copy()

        # Verify max_tokens is set correctly (should be 16000 for Groq)
        max_tokens_config = getattr(model_config, 'max_tokens', None)
        self.assertIsNotNone(max_tokens_config, "max_tokens not found in model configuration")
        self.assertGreater(
            max_tokens_config, 1000, f"max_tokens too low: {max_tokens_config}, should be > 1000"
        )
        self.assertNotEqual(
            max_tokens_config,
            1000,
            "max_tokens is still set to the default 1000 - this indicates the fix didn't work",
        )

        print(f"\n=== Testing Long Output with max_tokens={max_tokens_config} ===")

        # Initialize the LLM manager and get the model
        llm_manager = LLMManager()
        model = llm_manager.get_model(model_config)

        # Verify the model has the correct max_tokens set
        model_max_tokens = getattr(model, 'max_tokens', None)
        if model_max_tokens:
            print(f"Model max_tokens attribute: {model_max_tokens}")
        # Note: some models may store this in model_kwargs instead
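        if not model_max_tokens:
            # Hedged fallback (assumption): some LangChain chat models keep provider
            # parameters such as max_tokens in model_kwargs rather than as a
            # top-level attribute, so report that value when it is present.
            kwargs_max_tokens = (getattr(model, 'model_kwargs', None) or {}).get('max_tokens')
            if kwargs_max_tokens:
                print(f"Model model_kwargs max_tokens: {kwargs_max_tokens}")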
        # Create a prompt that should generate a very long response
        prompt = (
            "Write a comprehensive, detailed analysis of artificial intelligence, "
            "covering its history from the 1950s to present day, major breakthroughs, "
            "current state-of-the-art techniques, ethical considerations, future implications, "
            "and potential societal impacts. Include specific examples, technical details, "
            "and references to key researchers and organizations. Make this analysis "
            "as thorough and detailed as possible, aiming for at least 2000 words. "
            "Be very detailed and comprehensive in your response."
        )

        print("Sending prompt to LLM...")

        try:
            # Call the LLM directly
            messages = [HumanMessage(content=prompt)]
            response = await model.ainvoke(messages)

            # Extract the response text
            if hasattr(response, 'content'):
                answer_text = response.content
            else:
                answer_text = str(response)

            self.assertIsNotNone(answer_text, "Response is None")
            self.assertNotEqual(answer_text.strip(), "", "Response is empty")

            print(f"Response length: {len(answer_text)} characters")

            # Count approximate tokens
            # Word-based count (rough approximation)
            words = re.findall(r'\b\w+\b', answer_text)
            approx_tokens = len(words)
            # Also estimate based on characters (1 token ≈ 4 chars for English)
            char_based_estimate = len(answer_text) // 4

            print(f"Approximate token count (word-based): {approx_tokens}")
            print(f"Approximate token count (char-based): {char_based_estimate}")
            print(f"Response metadata: {response.response_metadata}")
            # Use the higher estimate to be conservative
            final_estimate = max(approx_tokens, char_based_estimate)

            # Assert that we have at least 1600 tokens worth of content
            self.assertGreaterEqual(
                final_estimate,
                1600,
                f"Response too short: {final_estimate} tokens (estimated), expected at least 1600. "
                f"This suggests max_tokens may not be set correctly. "
                f"Config max_tokens={max_tokens_config}, Model max_tokens={model_max_tokens}",
            )

            print(f"✓ Response meets minimum length requirement: {final_estimate} tokens (estimated)")

            # Check if the response appears truncated
            truncated_indicators = [
                "...",
                "truncated",
                "cut off",
                "incomplete",
                "continues",
                "to be continued",
            ]
            lower_answer = answer_text.lower()
            has_truncation_indicator = any(
                indicator in lower_answer[-200:] for indicator in truncated_indicators
            )
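            # Hedged check (assumption): OpenAI-compatible providers report why
            # generation stopped under response_metadata['finish_reason']; a value
            # of "length" is a stronger truncation signal than the heuristic above.
            finish_reason = response.response_metadata.get('finish_reason')
            if finish_reason:
                print(f"finish_reason reported by provider: {finish_reason}")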
            if has_truncation_indicator and final_estimate < 2000:
                print("⚠️ Response may be truncated (found truncation indicators)")
            else:
                print("✓ Response appears complete")

            # Print a sample of the response
            print("\n--- Response Sample (first 500 chars) ---")
            print(answer_text[:500] + "..." if len(answer_text) > 500 else answer_text)
        except Exception as e:
            self.fail(f"Test failed with exception: {e}")

    def test_max_tokens_from_config(self):
        """Test that max_tokens is correctly read from the configuration."""
        # Get the current model configuration
        model_config = settings.agent.code.model

        # Verify max_tokens is set and is a reasonable value
        max_tokens = getattr(model_config, 'max_tokens', None)
        self.assertIsNotNone(max_tokens, "max_tokens not found in model configuration")
        self.assertGreater(max_tokens, 1000, f"max_tokens too low: {max_tokens}, should be > 1000")
        print(f"✓ Model configuration has max_tokens = {max_tokens}")

        # Verify it's not the default 1000 that was causing the issue
        self.assertNotEqual(
            max_tokens, 1000, "max_tokens is still set to the default 1000 - this indicates the fix didn't work"
        )

        # Verify LLMManager extracts it correctly
        llm_manager = LLMManager()
        model_config_copy = model_config.copy()

        # This should not raise an assertion error
        try:
            llm_manager.get_model(model_config_copy)
            print(f"✓ LLMManager.get_model() successfully used max_tokens={max_tokens} from config")
        except AssertionError as e:
            if "max_tokens must be specified" in str(e):
                self.fail(f"LLMManager failed to extract max_tokens from config: {e}")
            raise


if __name__ == "__main__":
    unittest.main()