# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from dataclasses import dataclass

import torch
from megatron.core.optimizer import OptimizerConfig

from nemo import lightning as nl
from nemo.collections import llm
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer

## NOTE: This script is present for github-actions testing only.
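## Example invocation (hypothetical file name and paths, shown for illustration only):
##   python bert_pretraining_ci_test.py --experiment_dir /tmp/bert_ci \
##       --devices 1 --max_steps 3 --mbs 1 --tp_size 1 --pp_size 1 --type huggingface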


def get_args():
    parser = argparse.ArgumentParser(description='Pretraining a small BERT model using NeMo 2.0')
    parser.add_argument('--experiment_dir', type=str, help="directory to write results and checkpoints to")
    parser.add_argument('--devices', type=int, default=1, help="number of devices")
    parser.add_argument('--max_steps', type=int, default=3, help="number of training steps")
    parser.add_argument('--mbs', type=int, default=1, help="micro batch size")
    parser.add_argument('--tp_size', type=int, default=1, help="tensor parallel size")
    parser.add_argument('--pp_size', type=int, default=1, help="pipeline parallel size")
    parser.add_argument('--type', type=str, default='huggingface', help="model config to use: 'huggingface' or 'megatron'")
    return parser.parse_args()


if __name__ == '__main__':
    args = get_args()

    strategy = nl.MegatronStrategy(
        tensor_model_parallel_size=args.tp_size,
        pipeline_model_parallel_size=args.pp_size,
        # Pipeline dtype is coupled with the bf16 mixed precision plugin
        pipeline_dtype=torch.bfloat16,
        ckpt_load_strictness="log_all",  # Only for CI tests, to allow loading older checkpoint versions
    )
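
    # Sanity note (general Megatron-Core requirement, not specific to this script):
    # the number of devices should be divisible by tp_size * pp_size so that the
    # data-parallel size works out to a whole number.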
    trainer = nl.Trainer(
        devices=args.devices,
        max_steps=args.max_steps,
        accelerator="gpu",
        strategy=strategy,
        plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"),
        log_every_n_steps=1,
        limit_val_batches=2,
        val_check_interval=2,
        num_sanity_val_steps=0,
    )

    ckpt = nl.ModelCheckpoint(
        save_last=True,
        monitor="reduced_train_loss",
        save_top_k=1,
        save_on_train_epoch_end=True,
        save_optim_on_train_end=True,
    )

    logger = nl.NeMoLogger(
        log_dir=args.experiment_dir,
        use_datetime_version=False,  # must be false if using auto resume
        ckpt=ckpt,
    )

    adam = nl.MegatronOptimizerModule(
        config=OptimizerConfig(
            optimizer="adam",
            lr=0.0001,
            adam_beta2=0.98,
            use_distributed_optimizer=True,
            clip_grad=1.0,
            bf16=True,
        ),
    )

    data = llm.BERTMockDataModule(
        seq_length=512,
        micro_batch_size=args.mbs,
        global_batch_size=8,
        num_workers=0,
    )
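    # Note (general Megatron data-loading constraint, assumed here): global_batch_size
    # should be divisible by micro_batch_size * data-parallel size; that ratio determines
    # the number of gradient accumulation steps.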

    tokenizer = get_nmt_tokenizer("megatron", "BertWordPieceLowerCase")
    if args.type == 'huggingface':
        print('Init HuggingFace Bert Base Model')
        model = llm.BertModel(llm.HuggingFaceBertBaseConfig(), tokenizer=tokenizer)
    elif args.type == 'megatron':
        print('Init Megatron Bert Base Model')
        model = llm.BertModel(llm.MegatronBertBaseConfig(), tokenizer=tokenizer)
    else:
        raise ValueError(f'Unknown model type: {args.type}')

    resume = nl.AutoResume(
        resume_if_exists=True,
        resume_ignore_no_checkpoint=True,
    )

    llm.pretrain(model=model, data=data, trainer=trainer, log=logger, optim=adam, resume=resume)
    print("Bert Pretraining Succeeded")