MagpieTTS_Internal_Demo / .github /scripts /nemo_dependencies.py
subhankarg's picture
Upload folder using huggingface_hub
0558aa4 verified
#!/usr/bin/env python3
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
NeMo dependency structure definition.
This module analyzes the codebase to determine internal dependencies between NeMo collections and core components.
"""
import ast
import json
import os
from typing import Dict, List, Set
def find_python_files(directory: str) -> List[str]:
    """Collect the paths of all ``.py`` files in the project's source trees.

    Only the ``nemo``, ``scripts``, ``examples`` and ``tests`` sub-directories
    of ``directory`` are searched (recursively); sub-directories that do not
    exist are skipped.

    Args:
        directory: Project root that contains the searched sub-directories.

    Returns:
        Paths built by joining each walked directory with the file name, in
        the order ``os.walk`` yields them.
    """
    collected: List[str] = []
    for subdir in ('nemo', 'scripts', 'examples', 'tests'):
        search_root = os.path.join(directory, subdir)
        if not os.path.exists(search_root):
            continue
        for current_dir, _subdirs, filenames in os.walk(search_root):
            collected.extend(
                os.path.join(current_dir, filename) for filename in filenames if filename.endswith('.py')
            )
    return collected
def analyze_imports(nemo_root: str, file_path: str) -> Set[str]:
    """Return the ``nemo.*`` names a Python file imports via ``from ... import``.

    The file is parsed with ``ast``; only ``from nemo.<...> import <name>``
    statements are considered (plain ``import nemo.x`` and star imports are
    ignored). An import is tracked when it targets ``nemo.collections.<x>...``
    or ``nemo.<pkg>...`` where ``<pkg>`` is reported by
    ``find_top_level_packages``. When the imported module is a package whose
    ``__init__.py`` re-exports the name from another ``nemo.*`` module, the
    re-export target is reported instead of the package itself.

    Args:
        nemo_root: Project root; dotted module names are mapped to directories
            below it.
        file_path: Path of the Python file to analyze.

    Returns:
        A set of strings of the form ``"<module>.<name>"``. If the file cannot
        be read or parsed, the error is printed and whatever was collected so
        far (possibly nothing) is returned.
    """
    imports: Set[str] = set()
    # Re-export maps that have already been computed, keyed by package directory.
    # Repeated lookups of the same package (several names in one import, or
    # several import statements) must all see the same map.
    resolved_inits: Dict[str, Dict[str, str]] = {}
    # Packages whose __init__.py is being parsed right now; used only to break
    # circular re-exports.
    in_progress: Set[str] = set()
    # Filled on first use so files that only import from collections never
    # trigger the directory listing.
    top_level_packages = None

    def get_init_imports(module_path: str, depth: int = 0) -> Dict[str, str]:
        """Map names re-exported by ``module_path/__init__.py`` to their origin."""
        if module_path in resolved_inits:
            return resolved_inits[module_path]
        # Prevent infinite recursion: cap the chain at 10 levels and do not
        # re-enter a package that is still being resolved.
        if depth > 10 or module_path in in_progress:
            return {}

        init_path = os.path.join(module_path, '__init__.py')
        if not os.path.exists(init_path):
            resolved_inits[module_path] = {}
            return resolved_inits[module_path]

        in_progress.add(module_path)
        import_map: Dict[str, str] = {}
        try:
            with open(init_path, 'r', encoding='utf-8') as f:
                init_tree = ast.parse(f.read(), filename=init_path)

            for node in ast.walk(init_tree):
                if not (isinstance(node, ast.ImportFrom) and node.module and node.module.startswith('nemo.')):
                    continue
                names = [alias.name for alias in node.names if alias.name != '*']
                if not names:
                    continue

                module_parts = node.module.split('.')
                module_dir = os.path.join(nemo_root, *module_parts)

                if not os.path.exists(os.path.join(module_dir, '__init__.py')):
                    # Direct module import: the name lives in the module itself.
                    for name in names:
                        import_map[name] = f"{node.module}.{name}"
                    continue

                # The imported module is a package: follow its own re-exports.
                sub_imports = get_init_imports(module_dir, depth + 1)
                # Names not re-exported by the package are only mapped when a
                # module named like the package exists inside it.
                has_same_named_module = os.path.exists(os.path.join(module_dir, f"{module_parts[-1]}.py"))
                for name in names:
                    if name in sub_imports:
                        import_map[name] = sub_imports[name]
                    elif has_same_named_module:
                        import_map[name] = f"{node.module}.{name}"
        except Exception as e:
            # Best effort: an unreadable or unparsable __init__.py yields no mapping.
            print(f"Error analyzing {init_path}: {e}")
            import_map = {}
        finally:
            in_progress.discard(module_path)

        resolved_inits[module_path] = import_map
        return import_map

    def is_tracked(parts: List[str]) -> bool:
        """Tell whether an import of the module given by ``parts`` is recorded."""
        nonlocal top_level_packages
        if len(parts) == 1:
            return False
        if parts[1] == 'collections':
            # ``from nemo.collections import x`` names no specific collection.
            return len(parts) > 2
        if top_level_packages is None:
            top_level_packages = set(find_top_level_packages(nemo_root))
        return parts[1] in top_level_packages

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            tree = ast.parse(f.read(), filename=file_path)

        for node in ast.walk(tree):
            if not (isinstance(node, ast.ImportFrom) and node.module and node.module.startswith('nemo.')):
                continue
            parts = node.module.split('.')
            if not is_tracked(parts):
                continue

            init_imports = None
            for alias in node.names:
                if alias.name == '*':
                    continue
                if init_imports is None:
                    # One lookup per statement; every imported name uses the same map.
                    init_imports = get_init_imports(os.path.join(nemo_root, *parts))
                imports.add(init_imports.get(alias.name, f"{node.module}.{alias.name}"))
    except Exception as e:
        # Best effort: report the problem and return what was collected so far.
        print(f"Error analyzing {file_path}: {e}")

    return imports
def find_top_level_packages(nemo_root: str) -> List[str]:
    """List the sub-directories of ``<nemo_root>/nemo``.

    Both ``<nemo_root>/nemo`` and ``<nemo_root>/tests`` must exist; if either
    is missing a warning is printed and an empty list is returned.

    Args:
        nemo_root: Project root containing the ``nemo`` and ``tests`` directories.

    Returns:
        Sorted, duplicate-free names of the directories directly under
        ``nemo`` whose name does not start with ``__``.
    """
    packages: List[str] = []
    nemo_dir = os.path.join(nemo_root, 'nemo')
    tests_dir = os.path.join(nemo_root, 'tests')

    if not os.path.exists(nemo_dir):
        print(f"Warning: nemo directory not found at {nemo_dir}")
        return packages
    if not os.path.exists(tests_dir):
        print(f"Warning: tests directory not found at {tests_dir}")
        return packages

    # NOTE(review): entries of tests/ used to be listed as well, but each one was
    # validated against nemo/, so they could only repeat names already found
    # there. Listing nemo/ alone returns the same names without duplicates.
    # Confirm whether test-only directories were meant to be included.
    return sorted(
        item
        for item in os.listdir(nemo_dir)
        if not item.startswith('__') and os.path.isdir(os.path.join(nemo_dir, item))
    )
def find_collection_modules(nemo_root: str) -> Dict[str, List[str]]:
    """Build an empty module registry for every collection under ``nemo/collections``.

    Args:
        nemo_root: Project root containing ``nemo/collections``.

    Returns:
        A dict with one ``"nemo.collections.<name>"`` key per sub-directory of
        the collections directory (names starting with ``__`` are skipped),
        each mapped to a new empty list. An empty dict, after printing a
        warning, when the collections directory does not exist.
    """
    collections_dir = os.path.join(nemo_root, 'nemo', 'collections')
    if not os.path.exists(collections_dir):
        print(f"Warning: collections directory not found at {collections_dir}")
        return {}

    return {
        f"nemo.collections.{entry}": []
        for entry in os.listdir(collections_dir)
        if os.path.isdir(os.path.join(collections_dir, entry)) and not entry.startswith('__')
    }
# Every test bucket, in the order used for files that affect all of them.
_ALL_BUCKETS = ["nemo2", "unit-tests", "speech", "automodel", "export-deploy"]

# Substrings that assign a dependency name to the "speech" bucket.
_SPEECH_MARKERS = (
    "nemo.collections.asr",
    "nemo.collections.tts",
    "nemo.collections.speechlm",
    "nemo.collections.audio",
    "tests.collections.asr",
    "tests.collections.tts",
    "tests.collections.speechlm",
    "tests.collections.audio",
)

# Substrings that assign a dependency name to the "export-deploy" bucket.
_EXPORT_DEPLOY_MARKERS = ("nemo.export", "nemo.deploy", "tests.export", "tests.deploy")

# Substrings that assign a dependency name to the "automodel" bucket.
_AUTOMODEL_MARKERS = (
    "nemo.collections.llm",
    "nemo.collections.vlm",
    "nemo.automodel",
    "tests.collections.llm",
    "tests.collections.vlm",
    "tests.automodel",
)

# File-name substrings mapped to the buckets they trigger. Checked in order, so
# a later match overrides an earlier one for the same file.
_FILE_NAME_BUCKETS = (
    ("cicd-main-export-deploy", ["export-deploy"]),
    ("cicd-main-nemo2", ["nemo2"]),
    ("cicd-main-speech", ["speech"]),
    ("cicd-main-automodel", ["automodel"]),
    ("cicd-main-unit-tests", ["unit-tests"]),
    ("Dockerfile", _ALL_BUCKETS),
)


def _collect_direct_dependencies(nemo_root: str, top_level_packages: Set[str]) -> Dict[str, List[str]]:
    """Map each analyzed module (dotted path) to the names it depends on."""
    direct: Dict[str, List[str]] = {}
    for file_path in find_python_files(nemo_root):
        relative_path = os.path.relpath(file_path, nemo_root)
        parts = relative_path.split(os.sep)
        if len(parts) == 1 or parts[0] not in ("nemo", "tests"):
            continue

        # Strip only the file extension and use the same separator as the split above.
        module_path = os.path.splitext(relative_path)[0].replace(os.sep, ".")
        if parts[0] == "tests":
            # A test file is recorded as depending on itself only.
            direct[module_path] = [module_path]
        elif parts[1] == "collections" or parts[1] in top_level_packages:
            direct[module_path] = sorted(analyze_imports(nemo_root, file_path))
    return direct


def _reverse_graph(graph: Dict[str, List[str]]) -> Dict[str, Set[str]]:
    """Flip ``module -> imported names`` into ``imported name -> importing modules``."""
    reverse: Dict[str, Set[str]] = {}
    for source, targets in graph.items():
        for target in targets:
            reverse.setdefault(target, set()).add(source)
    return reverse


def _add_transitive_dependents(graph: Dict[str, Set[str]]) -> Dict[str, Set[str]]:
    """Extend every entry with the dependents of its dependents until nothing changes."""
    closure = {name: set(dependents) for name, dependents in graph.items()}
    changed = True
    while changed:
        changed = False
        for dependents in closure.values():
            reachable: Set[str] = set()
            for dependent in dependents:
                reachable |= closure.get(dependent, set())
            reachable -= dependents
            if reachable:
                # Mutated only after the iteration over ``dependents`` has finished.
                dependents |= reachable
                changed = True
    return closure


def _source_path_for(nemo_root: str, package: str) -> str:
    """Turn a dotted dependency name into a path relative to ``nemo_root``.

    Test modules map to their ``.py`` file. Other names drop the last component
    (the imported name) and map to the containing module file or, failing that,
    the containing package directory. Names that match neither are returned
    unchanged.
    """
    parts = package.split('.')
    if parts[0] == "tests":
        return f"{os.path.join(*parts)}.py"
    if len(parts) > 1:
        parent = os.path.join(*parts[:-1])
        # Check existence under nemo_root so the result does not depend on the
        # current working directory; the returned path stays relative.
        if os.path.isfile(os.path.join(nemo_root, f"{parent}.py")):
            return f"{parent}.py"
        if os.path.isdir(os.path.join(nemo_root, parent)):
            return parent
    return package


def _buckets_for_dependency(dep: str) -> Set[str]:
    """Return the test buckets triggered by a single dependency name."""
    buckets: Set[str] = set()
    is_speech = any(marker in dep for marker in _SPEECH_MARKERS)
    if is_speech:
        buckets.update(("speech", "unit-tests"))
    if any(marker in dep for marker in _EXPORT_DEPLOY_MARKERS):
        buckets.update(("export-deploy", "unit-tests"))
    if any(marker in dep for marker in _AUTOMODEL_MARKERS):
        buckets.update(("automodel", "unit-tests"))
    if "tests" in dep and "tests.functional_tests" not in dep:
        buckets.add("unit-tests")
    # Every non-speech collection belongs to "nemo2".
    if "nemo.collections" in dep and not is_speech:
        buckets.update(("nemo2", "unit-tests"))
    return buckets


def build_dependency_graph(nemo_root: str) -> Dict[str, List[str]]:
    """Map repository paths to the test buckets a change to them should trigger.

    Steps:
      1. Collect the ``nemo.*`` imports of every file under ``nemo/``; test
         files under ``tests/`` depend on themselves.
      2. Reverse the graph (imported name -> importing modules) and extend it
         with transitive dependents.
      3. Collapse keys to file or directory paths, and values to top-level
         packages, collection names or test modules.
      4. Translate the collapsed values into bucket names.
      5. Add every file in ``requirements/``, plus files whose name contains
         ``Dockerfile`` or one of the ``cicd-main-*`` markers.

    Args:
        nemo_root: Root directory of the project.

    Returns:
        A dict from path (relative to ``nemo_root``) to a list of bucket names,
        ordered by decreasing number of buckets and then by path.
    """
    # Find all top-level packages
    top_level_packages = find_top_level_packages(nemo_root)
    print(f"Found top-level packages: {top_level_packages}")
    # Loop-invariant lookups: computed once instead of once per dependency.
    top_level_set = set(top_level_packages)
    collection_modules = find_collection_modules(nemo_root)

    direct = _collect_direct_dependencies(nemo_root, top_level_set)

    # Flip the dependency graph to show reverse dependencies, then follow and
    # extend records with transitive dependencies.
    dependents = _add_transitive_dependents(_reverse_graph(direct))

    # Simplify values: Either top-level package or collection module
    simplified: Dict[str, Set[str]] = {}
    for package, deps in dependents.items():
        entry = simplified.setdefault(_source_path_for(nemo_root, package), set())
        # The dependency's own dotted name takes part in bucketing as well.
        entry.add(package)
        for dep in deps:
            dep_parts = dep.split('.')
            if len(dep_parts) >= 2 and dep_parts[1] in top_level_set and dep_parts[1] != 'collections':
                entry.add(f"{dep_parts[0]}.{dep_parts[1]}")
            elif dep_parts[0] == "tests":
                entry.add(dep)
            elif len(dep_parts) >= 3:
                simplified_name = f"nemo.{dep_parts[1]}.{dep_parts[2]}"
                if simplified_name in collection_modules:
                    entry.add(simplified_name)

    # Bucket
    dependencies: Dict[str, List[str]] = {}
    for path, deps in simplified.items():
        buckets: Set[str] = set()
        for dep in deps:
            buckets |= _buckets_for_dependency(dep)
        dependencies[path] = sorted(buckets)

    # Additional dependencies
    # Add all files in requirements/ directory
    requirements_dir = os.path.join(nemo_root, "requirements")
    if os.path.isdir(requirements_dir):
        for filename in os.listdir(requirements_dir):
            # Built directly as a root-relative key, independent of the working directory.
            dependencies[os.path.join("requirements", filename)] = list(_ALL_BUCKETS)

    # Add all Dockerfile and CI workflow files
    for root, _, files in os.walk(nemo_root):
        for filename in files:
            relative_path = os.path.relpath(os.path.join(root, filename), nemo_root)
            for marker, buckets_for_file in _FILE_NAME_BUCKETS:
                if marker in filename:
                    dependencies[relative_path] = list(buckets_for_file)

    # Sort dependencies by number of buckets (largest first); ties are broken by
    # path so the output is reproducible between runs.
    return dict(sorted(dependencies.items(), key=lambda item: (-len(item[1]), item[0])))
def main():
    """Analyze the project's dependencies and write them to ``nemo_dependencies.json``.

    The project root is taken to be three directory levels above this script.
    The JSON file is written to the current working directory with an indent
    of 4.
    """
    # Climb from this file's absolute path up three levels to the project root.
    nemo_root = os.path.abspath(__file__)
    for _ in range(3):
        nemo_root = os.path.dirname(nemo_root)

    # Serialize before opening the file so a failed run leaves no partial output.
    serialized = json.dumps(build_dependency_graph(nemo_root), indent=4)

    with open('nemo_dependencies.json', 'w', encoding='utf-8') as output_file:
        output_file.write(serialized)
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()