#!/usr/bin/env python3
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
NeMo dependency structure definition.
This module analyzes the codebase to determine internal dependencies between NeMo collections and core components.
"""

import ast
import json
import os
from typing import Dict, List, Set

# Bucket list assigned to files that map to every bucket (requirements files, Dockerfiles).
_ALL_BUCKETS = ["nemo2", "unit-tests", "speech", "automodel", "export-deploy"]

# Substrings that place a dependency in the "speech" bucket.
_SPEECH_MARKERS = (
    "nemo.collections.asr",
    "nemo.collections.tts",
    "nemo.collections.speechlm",
    "nemo.collections.audio",
    "tests.collections.asr",
    "tests.collections.tts",
    "tests.collections.speechlm",
    "tests.collections.audio",
)

# Substrings that place a dependency in the "export-deploy" bucket.
_EXPORT_DEPLOY_MARKERS = ("nemo.export", "nemo.deploy", "tests.export", "tests.deploy")

# Substrings that place a dependency in the "automodel" bucket.
_AUTOMODEL_MARKERS = (
    "nemo.collections.llm",
    "nemo.collections.vlm",
    "nemo.automodel",
    "tests.collections.llm",
    "tests.collections.vlm",
    "tests.automodel",
)

# (file-name substring, buckets) pairs, applied in order; when a file name matches
# several markers the LAST matching entry wins, so "Dockerfile" overrides the others.
_FILE_NAME_BUCKETS = (
    ("cicd-main-export-deploy", ["export-deploy"]),
    ("cicd-main-nemo2", ["nemo2"]),
    ("cicd-main-speech", ["speech"]),
    ("cicd-main-automodel", ["automodel"]),
    ("cicd-main-unit-tests", ["unit-tests"]),
    ("Dockerfile", _ALL_BUCKETS),
)


def find_python_files(directory: str) -> List[str]:
    """Find all Python files in the given directory and its subdirectories.

    Only the ``nemo``, ``scripts``, ``examples`` and ``tests`` sub-directories of
    ``directory`` are searched; sub-directories that do not exist are skipped.

    Args:
        directory: Root directory that contains the sub-directories listed above.

    Returns:
        Paths of every ``*.py`` file found, each prefixed with ``directory``.
    """
    python_files = []
    for dir_name in ('nemo', 'scripts', 'examples', 'tests'):
        dir_path = os.path.join(directory, dir_name)
        if not os.path.exists(dir_path):
            continue
        for root, _, files in os.walk(dir_path):
            python_files.extend(os.path.join(root, name) for name in files if name.endswith('.py'))
    return python_files


def analyze_imports(nemo_root: str, file_path: str) -> Set[str]:
    """Analyze a Python file and return its NeMo package dependencies using AST parsing.

    Only ``from nemo.<pkg>... import name`` statements are considered. Each imported
    name is recorded as ``<module>.<name>``, unless the ``__init__.py`` of the imported
    package re-exports that name from another ``nemo.`` module, in which case the
    re-export target is recorded instead. Star imports are ignored, and so are imports
    made directly from ``nemo.collections`` (without a sub-package).

    Args:
        nemo_root: Repository root, i.e. the directory that contains ``nemo/``.
        file_path: Path of the Python file to analyze.

    Returns:
        Set of dotted import targets. Empty when the file cannot be read or parsed
        (the error is printed, not raised).
    """
    imports: Set[str] = set()
    # Completed __init__ analyses keyed by package directory. Caching the result (instead
    # of only remembering that a package was visited) lets a second lookup of the same
    # package return the same mapping rather than an empty one.
    resolved: Dict[str, Dict[str, str]] = {}
    # Packages whose __init__.py is currently being analyzed; guards against circular re-exports.
    in_progress: Set[str] = set()
    # Filled on first use so files without non-collection imports never touch the filesystem for it.
    top_level_packages = None

    def get_init_imports(module_path: str, depth: int = 0) -> Dict[str, str]:
        """Recursively analyze imports from __init__.py files and map them to their final destinations."""
        if module_path in resolved:
            return resolved[module_path]
        # Prevent infinite recursion: limit depth to 10 levels and break import cycles.
        # Results computed while a cycle is being broken are best-effort.
        if depth > 10 or module_path in in_progress:
            return {}

        import_map: Dict[str, str] = {}
        init_path = os.path.join(module_path, '__init__.py')
        if os.path.exists(init_path):
            in_progress.add(module_path)
            try:
                with open(init_path, 'r', encoding='utf-8') as f:
                    init_tree = ast.parse(f.read(), filename=init_path)

                for node in ast.walk(init_tree):
                    if not (isinstance(node, ast.ImportFrom) and node.module and node.module.startswith('nemo.')):
                        continue

                    # Directory the imported module would occupy if it were a package.
                    module_parts = node.module.split('.')
                    module_dir = os.path.join(nemo_root, *module_parts)
                    is_package = os.path.exists(os.path.join(module_dir, '__init__.py'))

                    for alias in node.names:
                        if alias.name == '*':
                            continue
                        if not is_package:
                            # Direct module import
                            import_map[alias.name] = f"{node.module}.{alias.name}"
                            continue

                        # The imported module is a package: follow its own re-exports.
                        sub_imports = get_init_imports(module_dir, depth + 1)
                        if alias.name in sub_imports:
                            import_map[alias.name] = sub_imports[alias.name]
                        elif os.path.exists(os.path.join(module_dir, f"{module_parts[-1]}.py")):
                            # Not re-exported by the sub-package; it might come from the
                            # same-named module inside the package.
                            import_map[alias.name] = f"{node.module}.{alias.name}"
            except Exception as e:  # best-effort: one unreadable __init__ must not abort the scan
                print(f"Error analyzing {init_path}: {e}")
                import_map = {}
            finally:
                in_progress.discard(module_path)

        resolved[module_path] = import_map
        return import_map

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            tree = ast.parse(f.read(), filename=file_path)

        for node in ast.walk(tree):
            if not (isinstance(node, ast.ImportFrom) and node.module and node.module.startswith('nemo.')):
                continue

            parts = node.module.split('.')
            if len(parts) < 2:
                continue

            module_type = parts[1]
            if module_type == 'collections':
                # "from nemo.collections import x" names no specific collection.
                if len(parts) == 2:
                    continue
            else:
                if top_level_packages is None:
                    top_level_packages = set(find_top_level_packages(nemo_root))
                if module_type not in top_level_packages:
                    continue

            names = [alias.name for alias in node.names if alias.name != '*']
            if not names:
                continue

            # Names re-exported by the package's __init__.py are mapped to where they really live.
            init_imports = get_init_imports(os.path.join(nemo_root, *parts))
            for name in names:
                imports.add(init_imports.get(name, f"{node.module}.{name}"))

    except Exception as e:  # best-effort: report the file and keep going
        print(f"Error analyzing {file_path}: {e}")

    return imports


def find_top_level_packages(nemo_root: str) -> List[str]:
    """Find all top-level packages under nemo directory.

    Both ``<nemo_root>/nemo`` and ``<nemo_root>/tests`` must exist; otherwise a warning
    is printed and an empty list is returned. Entries of both directories are
    considered, but a name is kept only if it is a directory under ``nemo/`` and does
    not start with ``__``.

    Args:
        nemo_root: Repository root, i.e. the directory that contains ``nemo/``.

    Returns:
        Sorted package names without duplicates.
    """
    nemo_dir = os.path.join(nemo_root, 'nemo')
    tests_dir = os.path.join(nemo_root, 'tests')

    if not os.path.exists(nemo_dir):
        print(f"Warning: nemo directory not found at {nemo_dir}")
        return []
    if not os.path.exists(tests_dir):
        print(f"Warning: tests directory not found at {tests_dir}")
        return []

    # A set removes the duplicates produced by names that exist in both directories.
    packages = {
        item
        for item in os.listdir(nemo_dir) + os.listdir(tests_dir)
        if not item.startswith('__') and os.path.isdir(os.path.join(nemo_dir, item))
    }
    return sorted(packages)


def find_collection_modules(nemo_root: str) -> Dict[str, List[str]]:
    """Find all modules within collections.

    Args:
        nemo_root: Repository root, i.e. the directory that contains ``nemo/``.

    Returns:
        Mapping with one ``nemo.collections.<name>`` key, bound to an empty list, per
        sub-directory of ``nemo/collections`` whose name does not start with ``__``.
        Empty (after printing a warning) when the collections directory is missing.
    """
    collections_dir = os.path.join(nemo_root, 'nemo', 'collections')
    if not os.path.exists(collections_dir):
        print(f"Warning: collections directory not found at {collections_dir}")
        return {}

    return {
        f"nemo.collections.{collection}": []
        for collection in os.listdir(collections_dir)
        if not collection.startswith('__') and os.path.isdir(os.path.join(collections_dir, collection))
    }


def _transitive_closure(graph: Dict[str, List[str]]) -> Dict[str, List[str]]:
    """Return a copy of ``graph`` in which every node also lists everything reachable through its entries."""
    closure = {node: list(targets) for node, targets in graph.items()}

    # Keep iterating until a full pass adds nothing new.
    changed = True
    while changed:
        changed = False
        for targets in closure.values():
            known = set(targets)
            # Iterate over a snapshot because ``targets`` grows inside the loop.
            for target in list(targets):
                for transitive in closure.get(target, ()):
                    if transitive not in known:
                        known.add(transitive)
                        targets.append(transitive)
                        changed = True
    return closure


def _path_for_import(nemo_root: str, package: str) -> str:
    """Map a dotted import target to a path relative to ``nemo_root``, or return it unchanged if none matches."""
    package_parts = package.split('.')
    if package_parts[0] == "tests":
        return f"{os.path.join(*package_parts)}.py"
    if len(package_parts) < 2:
        return package

    # Drop the last component (the imported name); what remains should be a module or a package.
    parent = os.path.join(*package_parts[:-1])
    module_file = f"{parent}.py"
    # Probe under nemo_root so the result does not depend on the current working directory.
    if os.path.isfile(os.path.join(nemo_root, module_file)):
        return module_file
    if os.path.isdir(os.path.join(nemo_root, parent)):
        return parent
    return package


def _buckets_for_dependency(dep: str) -> Set[str]:
    """Return the bucket names selected by the substrings found in ``dep``."""
    buckets: Set[str] = set()
    is_speech = any(marker in dep for marker in _SPEECH_MARKERS)

    if is_speech:
        buckets.update(("speech", "unit-tests"))
    if any(marker in dep for marker in _EXPORT_DEPLOY_MARKERS):
        buckets.update(("export-deploy", "unit-tests"))
    if any(marker in dep for marker in _AUTOMODEL_MARKERS):
        buckets.update(("automodel", "unit-tests"))
    if "tests" in dep and "tests.functional_tests" not in dep:
        buckets.add("unit-tests")
    # Every non-speech collection additionally counts towards "nemo2".
    if "nemo.collections" in dep and not is_speech:
        buckets.update(("nemo2", "unit-tests"))
    return buckets


def build_dependency_graph(nemo_root: str) -> Dict[str, List[str]]:
    """Build a dependency graph by analyzing all Python files.

    Steps:
      1. Collect the ``nemo.`` imports of every Python file under ``nemo/``; files under
         ``tests/`` are recorded as depending on themselves.
      2. Reverse the graph (import target -> modules using it) and extend it transitively.
      3. Replace each import target by a path relative to ``nemo_root`` where one exists,
         and each user by its top-level package, collection or test module.
      4. Translate those users into bucket names.
      5. Add fixed entries for files in ``requirements/`` and for files whose name
         contains ``Dockerfile`` or one of the ``cicd-main-*`` markers.

    Args:
        nemo_root: Repository root, i.e. the directory that contains ``nemo/``.

    Returns:
        Mapping from path (or unresolved dotted name) to bucket names, ordered by
        decreasing number of buckets.
    """
    # Find all top-level packages
    top_level_packages = find_top_level_packages(nemo_root)
    print(f"Found top-level packages: {top_level_packages}")
    top_level = set(top_level_packages)
    collection_modules = find_collection_modules(nemo_root)

    # 1. Direct dependencies: module -> imported names.
    dependencies: Dict[str, List[str]] = {}
    for file_path in find_python_files(nemo_root):
        relative_path = os.path.relpath(file_path, nemo_root)
        parts = relative_path.split(os.sep)
        if len(parts) == 1 or parts[0] not in ("nemo", "tests"):
            continue

        module_path = os.path.splitext(relative_path)[0].replace(os.sep, ".")
        if parts[0] == 'tests':
            # Tests are not parsed; each test module simply maps to itself.
            dependencies[module_path] = [module_path]
        elif parts[1] == 'collections' or parts[1] in top_level:
            dependencies[module_path] = sorted(analyze_imports(nemo_root, file_path))

    # 2. Flip the dependency graph to show reverse dependencies, then follow them transitively.
    reverse_dependencies: Dict[str, List[str]] = {}
    for package, deps in dependencies.items():
        for dep in deps:
            reverse_dependencies.setdefault(dep, []).append(package)
    dependencies = _transitive_closure(reverse_dependencies)

    # 3. Simplify values: either top-level package, test module or collection module.
    simplified_dependencies: Dict[str, List[str]] = {}
    for package, deps in dependencies.items():
        simplified_package_path = _path_for_import(nemo_root, package)
        # Several import targets can share one path; keep what earlier ones contributed.
        entries = set(simplified_dependencies.get(simplified_package_path, ()))

        for dep in deps:
            dep_parts = dep.split('.')
            if len(dep_parts) >= 2 and dep_parts[1] in top_level and dep_parts[1] != 'collections':
                entries.add(f"{dep_parts[0]}.{dep_parts[1]}")
            elif dep_parts[0] == "tests":
                entries.add(dep)
            elif len(dep_parts) >= 3:
                simplified_name = f"nemo.{dep_parts[1]}.{dep_parts[2]}"
                if simplified_name in collection_modules:
                    entries.add(simplified_name)

        # The import target itself also takes part in bucket selection.
        entries.add(package)
        simplified_dependencies[simplified_package_path] = sorted(entries)
    dependencies = simplified_dependencies

    # 4. Bucket
    bucket_deps: Dict[str, List[str]] = {}
    for package, deps in dependencies.items():
        buckets: Set[str] = set()
        for dep in deps:
            buckets |= _buckets_for_dependency(dep)
        bucket_deps[package] = sorted(buckets)
    dependencies = bucket_deps

    # 5. Additional dependencies
    # Add all files in requirements/ directory
    requirements_dir = os.path.join(nemo_root, "requirements")
    if os.path.isdir(requirements_dir):
        for filename in os.listdir(requirements_dir):
            # Keys are relative to nemo_root, so the path is built directly rather than
            # resolved against the current working directory.
            dependencies[os.path.join("requirements", filename)] = list(_ALL_BUCKETS)

    # Add all Dockerfile files and CI definition files
    for root, _, files in os.walk(nemo_root):
        for filename in files:
            relative_path = os.path.relpath(os.path.join(root, filename), nemo_root)
            for marker, buckets_for_file in _FILE_NAME_BUCKETS:
                if marker in filename:
                    dependencies[relative_path] = list(buckets_for_file)

    # Sort dependencies by length of values (number of dependencies)
    dependencies = dict(sorted(dependencies.items(), key=lambda x: len(x[1]), reverse=True))

    return dependencies


def main():
    """Main function to analyze dependencies and output JSON.

    The project root is taken to be three directory levels above this file. The result
    is written to ``nemo_dependencies.json`` in the current working directory.
    """
    # Get the root directory of the NeMo project
    nemo_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

    # Build dependency graph
    dependencies = build_dependency_graph(nemo_root)

    # Output as JSON
    data = json.dumps(dependencies, indent=4)

    with open('nemo_dependencies.json', 'w', encoding='utf-8') as f:
        f.write(data)


if __name__ == "__main__":
    main()