Spaces:
Runtime error
Runtime error
| #!/usr/bin/env python3 | |
| # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| """ | |
| NeMo dependency structure definition. | |
| This module analyzes the codebase to determine internal dependencies between NeMo collections and core components. | |
| """ | |
| import ast | |
| import json | |
| import os | |
| from typing import Dict, List, Set | |
def find_python_files(directory: str) -> List[str]:
    """Collect every ``.py`` file below the project's source sub-directories.

    Only the ``nemo``, ``scripts``, ``examples`` and ``tests`` folders directly
    under ``directory`` are searched; folders that do not exist are skipped.

    Args:
        directory: Root directory that contains the folders listed above.

    Returns:
        Paths of the discovered files, each prefixed with ``directory``, in the
        order ``os.walk`` yields them.
    """
    discovered: List[str] = []
    for subdir in ('nemo', 'scripts', 'examples', 'tests'):
        top = os.path.join(directory, subdir)
        if not os.path.exists(top):
            continue
        for current, _, entries in os.walk(top):
            discovered.extend(os.path.join(current, entry) for entry in entries if entry.endswith('.py'))
    return discovered
def analyze_imports(nemo_root: str, file_path: str) -> Set[str]:
    """Return the NeMo symbols a Python file pulls in via ``from nemo.<...> import <name>``.

    Only ``from ... import ...`` statements whose module starts with ``nemo.`` are
    considered; plain ``import nemo.x`` statements and ``*`` imports are ignored.
    ``from nemo.collections import X`` is skipped, and modules outside
    ``nemo.collections`` are only followed when their second path component is
    listed by ``find_top_level_packages``.

    When the imported module is a package whose ``__init__.py`` re-exports the
    name from another ``nemo.`` module, the name is reported at that final
    location; otherwise it is reported as ``<module>.<name>``.

    Args:
        nemo_root: Directory that contains the ``nemo`` package.
        file_path: Python source file to analyze.

    Returns:
        Set of dotted paths. The set is empty (or partial) if the file cannot be
        read or parsed; the error is printed rather than raised.
    """
    imports: Set[str] = set()
    # Re-export map per package directory. A package is seeded with an empty map
    # while it is being analyzed, so an import cycle sees {} instead of recursing;
    # once finished, repeat lookups get the real map. (Previously every repeat
    # lookup returned {}, so only the first name of a multi-name import resolved.)
    init_maps: Dict[str, Dict[str, str]] = {}
    # Filled on first use so the directory scan runs at most once per file.
    top_level_cache: List[List[str]] = []

    def get_top_level_packages() -> List[str]:
        """Return the top-level package names, scanning the tree only once."""
        if not top_level_cache:
            top_level_cache.append(find_top_level_packages(nemo_root))
        return top_level_cache[0]

    def get_init_imports(module_path: str, depth: int = 0) -> Dict[str, str]:
        """Map names re-exported by ``module_path/__init__.py`` to their final dotted path."""
        if module_path in init_maps:
            return init_maps[module_path]
        if depth > 10:  # Hard stop for very deep re-export chains
            return {}

        init_maps[module_path] = {}
        init_path = os.path.join(module_path, '__init__.py')
        if not os.path.exists(init_path):
            return init_maps[module_path]

        try:
            with open(init_path, 'r', encoding='utf-8') as f:
                init_tree = ast.parse(f.read(), filename=init_path)

            import_map: Dict[str, str] = {}
            for node in ast.walk(init_tree):
                if not (isinstance(node, ast.ImportFrom) and node.module and node.module.startswith('nemo.')):
                    continue

                module_parts = node.module.split('.')
                module_dir = os.path.join(nemo_root, *module_parts)
                is_package = os.path.exists(os.path.join(module_dir, '__init__.py'))
                # Resolved once per statement: it is the same for every imported name.
                sub_imports = get_init_imports(module_dir, depth + 1) if is_package else {}

                for name in node.names:
                    if name.name == '*':
                        continue
                    if not is_package:
                        # Direct module import
                        import_map[name.name] = f"{node.module}.{name.name}"
                    elif name.name in sub_imports:
                        import_map[name.name] = sub_imports[name.name]
                    elif os.path.exists(os.path.join(module_dir, f"{module_parts[-1]}.py")):
                        # Not re-exported by the sub-package; it might come from the
                        # same-named module inside it.
                        import_map[name.name] = f"{node.module}.{name.name}"
        except Exception as e:  # Best effort: an unreadable __init__ must not abort the scan
            print(f"Error analyzing {init_path}: {e}")
            return {}

        init_maps[module_path] = import_map
        return import_map

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            tree = ast.parse(f.read(), filename=file_path)

        for node in ast.walk(tree):
            if not (isinstance(node, ast.ImportFrom) and node.module and node.module.startswith('nemo.')):
                continue

            # The 'nemo.' prefix guarantees at least two components.
            parts = node.module.split('.')
            module_type = parts[1]
            if module_type == 'collections':
                if len(parts) == 2:
                    continue
            elif module_type not in get_top_level_packages():
                continue

            init_imports = get_init_imports(os.path.join(nemo_root, *parts))
            for name in node.names:
                if name.name == '*':
                    continue
                # Prefer the location the package __init__ re-exports the name from.
                imports.add(init_imports.get(name.name, f"{node.module}.{name.name}"))
    except Exception as e:  # Best effort: skip files that cannot be read or parsed
        print(f"Error analyzing {file_path}: {e}")

    return imports
def find_top_level_packages(nemo_root: str) -> List[str]:
    """Find all top-level packages under the nemo directory.

    A package is any sub-directory of ``<nemo_root>/nemo`` whose name does not
    start with ``__`` (which excludes ``__pycache__``).

    Args:
        nemo_root: Directory that contains the ``nemo`` and ``tests`` folders.

    Returns:
        Sorted list of unique directory names. Empty if ``<nemo_root>/nemo``
        does not exist. A missing ``tests`` folder only produces a warning,
        since it does not affect which packages exist under ``nemo``.
    """
    nemo_dir = os.path.join(nemo_root, 'nemo')
    tests_dir = os.path.join(nemo_root, 'tests')

    if not os.path.isdir(nemo_dir):
        print(f"Warning: nemo directory not found at {nemo_dir}")
        return []

    if not os.path.isdir(tests_dir):
        print(f"Warning: tests directory not found at {tests_dir}")

    # Entries of tests/ used to be appended to this listing but were still checked
    # against nemo/, so they could only add duplicates; listing nemo/ alone yields
    # the same names exactly once.
    packages = {
        entry
        for entry in os.listdir(nemo_dir)
        if not entry.startswith('__') and os.path.isdir(os.path.join(nemo_dir, entry))
    }
    return sorted(packages)
def find_collection_modules(nemo_root: str) -> Dict[str, List[str]]:
    """List the collections that live under ``<nemo_root>/nemo/collections``.

    Args:
        nemo_root: Directory that contains the ``nemo`` package.

    Returns:
        A dict with one ``nemo.collections.<name>`` key per sub-directory whose
        name does not start with ``__``; every value is a fresh empty list.
        Empty (after printing a warning) when the collections directory is missing.
    """
    collections_dir = os.path.join(nemo_root, 'nemo', 'collections')
    if not os.path.exists(collections_dir):
        print(f"Warning: collections directory not found at {collections_dir}")
        return {}

    return {
        f"nemo.collections.{entry}": []
        for entry in os.listdir(collections_dir)
        if os.path.isdir(os.path.join(collections_dir, entry)) and not entry.startswith('__')
    }
# Substrings that mark a dependency as belonging to the speech bucket.
_SPEECH_MARKERS = (
    "nemo.collections.asr",
    "nemo.collections.tts",
    "nemo.collections.speechlm",
    "nemo.collections.audio",
    "tests.collections.asr",
    "tests.collections.tts",
    "tests.collections.speechlm",
    "tests.collections.audio",
)
# Substrings that mark a dependency as belonging to the export-deploy bucket.
_EXPORT_DEPLOY_MARKERS = ("nemo.export", "nemo.deploy", "tests.export", "tests.deploy")
# Substrings that mark a dependency as belonging to the automodel bucket.
_AUTOMODEL_MARKERS = (
    "nemo.collections.llm",
    "nemo.collections.vlm",
    "nemo.automodel",
    "tests.collections.llm",
    "tests.collections.vlm",
    "tests.automodel",
)
# Every bucket, in the order used for files that affect all of them.
_ALL_BUCKETS = ["nemo2", "unit-tests", "speech", "automodel", "export-deploy"]
# File-name substring -> single bucket. Checked in order; a later match overrides an earlier one.
_WORKFLOW_BUCKETS = (
    ("cicd-main-export-deploy", "export-deploy"),
    ("cicd-main-nemo2", "nemo2"),
    ("cicd-main-speech", "speech"),
    ("cicd-main-automodel", "automodel"),
    ("cicd-main-unit-tests", "unit-tests"),
)


def _reverse_edges(graph: Dict[str, List[str]]) -> Dict[str, List[str]]:
    """Flip ``module -> imported names`` into ``imported name -> importing modules``."""
    flipped: Dict[str, List[str]] = {}
    for source, targets in graph.items():
        for target in targets:
            flipped.setdefault(target, []).append(source)
    return flipped


def _transitive_closure(graph: Dict[str, List[str]]) -> Dict[str, Set[str]]:
    """Extend every entry with the entries of its members until nothing changes."""
    closure: Dict[str, Set[str]] = {key: set(values) for key, values in graph.items()}
    changed = True
    while changed:
        changed = False
        for members in closure.values():
            # Gather additions separately so no set is modified while it is iterated.
            additions: Set[str] = set()
            for member in members:
                additions |= closure.get(member, set())
            additions -= members
            if additions:
                members |= additions
                changed = True
    return closure


def _bucket_labels(dep: str) -> Set[str]:
    """Return the CI buckets a single simplified dependency name falls into."""
    labels: Set[str] = set()
    is_speech = any(marker in dep for marker in _SPEECH_MARKERS)
    if is_speech:
        labels.update(("speech", "unit-tests"))
    if any(marker in dep for marker in _EXPORT_DEPLOY_MARKERS):
        labels.update(("export-deploy", "unit-tests"))
    if any(marker in dep for marker in _AUTOMODEL_MARKERS):
        labels.update(("automodel", "unit-tests"))
    if "tests" in dep and "tests.functional_tests" not in dep:
        labels.add("unit-tests")
    if "nemo.collections" in dep and not is_speech:
        labels.update(("nemo2", "unit-tests"))
    return labels


def build_dependency_graph(nemo_root: str) -> Dict[str, List[str]]:
    """Build a map from repository paths to the CI buckets they affect.

    Steps:
      1. Collect, for every Python file under ``nemo/`` and ``tests/``, the NeMo
         names it imports (test files are recorded as depending on themselves).
      2. Reverse the graph and close it transitively, so every imported name
         lists all modules that reach it.
      3. Turn each imported name into a repository-relative path and reduce its
         dependents to top-level packages, collections or test modules.
      4. Replace those dependents with bucket names.
      5. Add the files in ``requirements/``, the ``cicd-main-*`` files and every
         Dockerfile.

    Args:
        nemo_root: Root directory of the NeMo repository. All filesystem checks
            are made relative to it, independent of the current working directory.

    Returns:
        Dict of path -> list of bucket names, ordered by descending number of
        buckets and then by path so the output is reproducible.
    """
    top_level_packages = find_top_level_packages(nemo_root)
    print(f"Found top-level packages: {top_level_packages}")
    # Both lookups scan the filesystem, so they are resolved once instead of per dependency.
    top_level = set(top_level_packages)
    collection_modules = find_collection_modules(nemo_root)

    # Step 1: module -> imported names
    dependencies: Dict[str, List[str]] = {}
    for file_path in find_python_files(nemo_root):
        relative_path = os.path.relpath(file_path, nemo_root)
        parts = relative_path.split(os.sep)
        if len(parts) == 1 or parts[0] not in ("nemo", "tests"):
            continue

        # Strip only the trailing extension; a blanket replace of ".py" would also
        # eat that text in the middle of a path.
        module_path = ".".join(os.path.splitext(relative_path)[0].split(os.sep))

        if parts[0] == "tests":
            dependencies[module_path] = [module_path]
        elif parts[1] in top_level or parts[1] == "collections":
            dependencies[module_path] = list(analyze_imports(nemo_root, file_path))

    # Step 2: imported name -> every module that directly or indirectly imports it
    closed = _transitive_closure(_reverse_edges(dependencies))

    # Step 3: Either top-level package, collection module or test module
    simplified: Dict[str, Set[str]] = {}
    for package, deps in closed.items():
        package_parts = package.split('.')
        if package_parts[0] == "tests":
            simplified_package_path = f"{os.path.join(*package_parts)}.py"
        elif len(package_parts) < 2:
            simplified_package_path = package
        else:
            # The last component may be a symbol rather than a module, so look for
            # the parent as a file or directory inside the repository.
            parent = os.path.join(*package_parts[:-1])
            if os.path.isfile(os.path.join(nemo_root, f"{parent}.py")):
                simplified_package_path = f"{parent}.py"
            elif os.path.isdir(os.path.join(nemo_root, parent)):
                simplified_package_path = parent
            else:
                simplified_package_path = package

        entries = simplified.setdefault(simplified_package_path, set())
        for dep in deps:
            dep_parts = dep.split('.')
            if len(dep_parts) >= 2 and dep_parts[1] in top_level and dep_parts[1] != 'collections':
                entries.add(f"{dep_parts[0]}.{dep_parts[1]}")
            elif dep_parts[0] == "tests":
                entries.add(dep)
            elif len(dep_parts) >= 3:
                simplified_name = f"nemo.{dep_parts[1]}.{dep_parts[2]}"
                if simplified_name in collection_modules:
                    entries.add(simplified_name)
        # The imported name itself also decides which buckets apply.
        entries.add(package)

    # Step 4: Bucket
    result: Dict[str, List[str]] = {}
    for path, deps in simplified.items():
        labels: Set[str] = set()
        for dep in deps:
            labels |= _bucket_labels(dep)
        result[path] = sorted(labels)

    # Step 5: Additional dependencies
    # Every file in requirements/ affects all buckets.
    requirements_dir = os.path.join(nemo_root, "requirements")
    if os.path.isdir(requirements_dir):
        for filename in os.listdir(requirements_dir):
            result[os.path.join("requirements", filename)] = list(_ALL_BUCKETS)

    # cicd-main-* files map to their own bucket; Dockerfiles affect all buckets.
    for root, _, files in os.walk(nemo_root):
        for filename in files:
            relative_path = os.path.relpath(os.path.join(root, filename), nemo_root)
            for marker, bucket in _WORKFLOW_BUCKETS:
                if marker in filename:
                    result[relative_path] = [bucket]
            if "Dockerfile" in filename:
                result[relative_path] = list(_ALL_BUCKETS)

    # Most widely shared files first; ties broken by path for a stable output.
    return dict(sorted(result.items(), key=lambda item: (-len(item[1]), item[0])))
def main():
    """Analyze the project and write the result to ``nemo_dependencies.json``.

    The project root is taken to be three directory levels above this script.
    The JSON file is written to the current working directory.
    """
    # Climb three directory levels from this file to reach the project root.
    nemo_root = os.path.abspath(__file__)
    for _ in range(3):
        nemo_root = os.path.dirname(nemo_root)

    # Serialize first; the output file is only opened once the JSON text exists.
    serialized = json.dumps(build_dependency_graph(nemo_root), indent=4)

    with open('nemo_dependencies.json', 'w', encoding='utf-8') as output_file:
        output_file.write(serialized)


if __name__ == "__main__":
    main()