#!/bin/csh
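# Usage: csh <this script> [-clean | -clean_only]
#   -clean       kill stray python processes and delete old outputs before training
#   -clean_only  do the cleanup and exit without training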
set arg_count = $#argv
if ( $arg_count >= 1 ) then
    if ( "$argv[1]" == "-clean" || "$argv[1]" == "-clean_only" ) then
        echo "[INFO] Killing all other GPU processes to free up resources."
        sh -c 'ps | grep python | sed "s/ pts.\+$//g" > .tmp.csh'
        chmod +x .tmp.csh
        sed -i "s/^/kill -9 /g" .tmp.csh
        source .tmp.csh
        rm -f .tmp.csh
        rm -rf debug_rank_*
        rm -rf dynamicstereo_sf_dr
    endif
| if ( "$argv[1]" == "-clean_only" ) then | |
| exit 0 | |
| endif | |
| endif | |
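
# CUDA / PyTorch memory and debugging settings:
#   PYTORCH_CUDA_ALLOC_CONF        - tune the caching allocator to limit fragmentation
#                                    (32 MB max split size, reclaim cached blocks past
#                                    50% usage, no expandable segments)
#   CUDA_LAUNCH_BLOCKING           - make kernel launches synchronous for clearer errors
#   PYTORCH_NO_CUDA_MEMORY_CACHING - bypass the caching allocator entirely (slower)
#   CUBLAS_WORKSPACE_CONFIG        - fixed cuBLAS workspace size, needed for determinism
#   CUDA_VISIBLE_DEVICES           - expose only GPU 3 to the training process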
setenv PYTORCH_CUDA_ALLOC_CONF "max_split_size_mb:32,garbage_collection_threshold:0.5,expandable_segments:False"
setenv CUDA_LAUNCH_BLOCKING 1
setenv PYTORCH_NO_CUDA_MEMORY_CACHING 1
setenv CUBLAS_WORKSPACE_CONFIG ":16:8"
setenv CUDA_VISIBLE_DEVICES 3
# NOTE: training with sample_len=8 hit a GPU OOM error on kilby.
python train.py --batch_size 1 \
    --image_size 480 640 --saturation_range 0 1.4 --num_steps 200000 \
    --ckpt_path dynamicstereo_sf_dr \
    --sample_len 8 --lr 0.0003 --train_iters 8 --valid_iters 8 \
    --num_workers 28 --save_freq 100 --update_block_3d --different_update_blocks \
    --attention_type self_stereo_temporal_update_time_update_space --train_datasets dynamic_replica
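# If the OOM noted above recurs, reducing --sample_len or --image_size should be the
# first knobs to try, since both directly scale per-step GPU memory (not verified here).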