#!/bin/csh
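# -- train.csh: training launcher for E2E_SCSI.
# -- Usage:
# --   ./train.csh               start training
# --   ./train.csh -clean        kill stray python processes and remove old
# --                             outputs before training
# --   ./train.csh -clean_only   clean up, then exit without training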
set arg_count = $#argv
if ( $arg_count >= 1 ) then
if ( "$argv[1]" == "-clean" || "$argv[1]" == "-clean_only" ) then
echo "[INFO] Killing alll other GPU processes to free up resources."
sh -c 'ps | grep python | sed "s/ pts.\+$//g" > .tmp.csh'
chmod +x .tmp.csh
sed -i "s/^/kill -9 /g" .tmp.csh
source .tmp.csh
rm -rf .tmp.csh
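        # -- Note: a bare `ps` only lists processes attached to the current
        # -- terminal. A broader sweep could use `pkill -9 -f python` instead
        # -- (assuming procps pkill is installed); kept as-is here to preserve
        # -- the original, narrower behavior.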
        # -- Remove stale per-rank debug dumps and the old checkpoint directory.
        rm -rf debug_rank_*
        rm -rf dynamicstereo_sf_dr
    endif
if ( "$argv[1]" == "-clean_only" ) then
exit 0
endif
endif
# -- Conservative allocator settings to limit fragmentation on a tight GPU.
setenv PYTORCH_CUDA_ALLOC_CONF "max_split_size_mb:32,garbage_collection_threshold:0.5,expandable_segments:False"
# -- Synchronous kernel launches, so CUDA errors surface at the failing call.
setenv CUDA_LAUNCH_BLOCKING 1
# -- Disable PyTorch's caching allocator (a debugging aid for memory issues).
setenv PYTORCH_NO_CUDA_MEMORY_CACHING 1
# -- Fixed cuBLAS workspace size, required for deterministic behavior.
setenv CUBLAS_WORKSPACE_CONFIG ":16:8"
# -- Run on GPU 3 only.
setenv CUDA_VISIBLE_DEVICES 3
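# -- Note: CUDA_LAUNCH_BLOCKING and PYTORCH_NO_CUDA_MEMORY_CACHING are
# -- debugging aids that slow training considerably; they can be unset once
# -- the OOM issue below is resolved.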
# -- GPU OOM error when training with sample_len=8 on kilby.
python train.py --batch_size 1 \
--image_size 480 640 --saturation_range 0 1.4 --num_steps 200000 \
--ckpt_path dynamicstereo_sf_dr \
--sample_len 8 --lr 0.0003 --train_iters 8 --valid_iters 8 \
--num_workers 28 --save_freq 100 --update_block_3d --different_update_blocks \
--attention_type self_stereo_temporal_update_time_update_space --train_datasets dynamic_replica
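# -- If OOM persists, lowering --sample_len (e.g. to 4) or shrinking
# -- --image_size are the usual levers, at the cost of temporal context
# -- and resolution.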