""" This script is used to download the DL3DV-10 dataset for all resolution levels from the huggingface repo. As the whole dataset is too large for most users, we provide this script so that you can download the dataset efficiently based on your needs. We provide several options to download the dataset (image frames with poses): - [X] Resolution level: 4K, 2K, 960P, 480P - [X] Subset of the 10K, e.g. 1K(0~1K), 2K(1K~2K), 3K(2K~3K), etc - [X] specific hash - [X] file_type: raw video | images+poses | colmap cache Notes: - file_type + resolution will decide which dataset repo to download the files - subset will decide which subdir will be used - if hash is set, only the specific hash will be downloaded """ import os from os.path import join import pandas as pd from tqdm import tqdm from huggingface_hub import HfApi import argparse import traceback import shutil import urllib.request import zipfile from huggingface_hub import HfFileSystem from multiprocessing import Process api = HfApi() resolution2repo = { '480P': 'DL3DV/DL3DV-ALL-480P', '960P': 'DL3DV/DL3DV-ALL-960P', '2K': 'DL3DV/DL3DV-ALL-2K', '4K': 'DL3DV/DL3DV-ALL-4K' } def verify_access(repo: str): """ This function can be used to verify if the user has access to the repo. :param repo: the repo name :return: True if the user has access, False otherwise """ fs = HfFileSystem() try: fs.ls(f'datasets/{repo}') return True except BaseException as e: return False def hf_download_path(repo: str, rel_path: str, odir: str, max_try: int = 5): """ hf api is not reliable, retry when failed with max tries :param repo: The huggingface dataset repo :param rel_path: The relative path in the repo :param odir: output path :param max_try: As the downloading is not a reliable process, we will retry for max_try times """ counter = 0 while True: if counter >= max_try: print(f"ERROR: Download {repo}/{rel_path} failed.") return False try: api.hf_hub_download(repo_id=repo, filename=rel_path, repo_type='dataset', local_dir=odir, cache_dir=join(odir, '.cache')) return True except KeyboardInterrupt: print('Keyboard Interrupt. Exit.') exit() except BaseException as e: traceback.print_exc() counter += 1 # print(f'Downloading summary {counter}') def download_from_url(url: str, ofile: str): """ Download a file from the url to ofile :param url: The url link :param ofile: The output path :return: True if download success, False otherwise """ try: # Use urllib.request.urlretrieve to download the file from `url` and save it locally at `local_file_path` urllib.request.urlretrieve(url, ofile) return True except Exception as e: print(f"An error occurred while downloading the file: {e}") return False def clean_huggingface_cache(output_dir: str, repo: str): """ Huggingface cache may take too much space, we clean the cache to save space if necessary Current huggingface hub does not provide good practice to clean the space. We mannually clean the cache directory if necessary. :param output_dir: the current output directory :param output_dir: the huggingface repo """ repo_cache_dir = repo.replace('/', '--') # cur_cache_dir = join(output_dir, '.cache', f'datasets--{repo_cache_dir}') cur_cache_dir = join(output_dir, '.cache') if os.path.exists(cur_cache_dir): shutil.rmtree(cur_cache_dir) def get_download_list(subset_opt: str, hash_name: str, reso_opt: str, file_type: str, output_dir: str): """ Get the download list based on the subset and hash name 1. Get the meta file 2. Select the subset. Based on reso_opt, get the downloading list prepared. 3. Return the download list. 

def get_download_list(subset_opt: str, hash_name: str, reso_opt: str, file_type: str, output_dir: str):
    """ Get the download list based on the subset and hash name.
    1. Get the meta file.
    2. Select the subset. Based on reso_opt, prepare the download list.
    3. Return the download list.

    :param subset_opt: subset of the 10K, e.g. 1K(0~1K), 2K(1K~2K), 3K(2K~3K), etc.
    :param hash_name: if a non-empty string is provided, ignore subset_opt and only download this specific hash
    :param reso_opt: the resolution to download
    :param file_type: the file type to download: video | images+poses | colmap_cache
    :param output_dir: the output directory
    """
    def to_download_item(hash_name, reso, batch, file_type):
        if file_type == 'images+poses':
            repo = resolution2repo[reso]
            rel_path = f'{batch}/{hash_name}.zip'
        elif file_type == 'video':
            repo = 'DL3DV/DL3DV-ALL-video'
            rel_path = f'{batch}/{hash_name}/video.mp4'
        elif file_type == 'colmap_cache':
            repo = 'DL3DV/DL3DV-ALL-ColmapCache'
            rel_path = f'{batch}/{hash_name}.zip'
        else:
            raise ValueError(f'Unknown file_type: {file_type}')

        return {'repo': repo, 'rel_path': rel_path}

    ret = []
    meta_link = 'https://raw.githubusercontent.com/DL3DV-10K/Dataset/main/cache/DL3DV-valid.csv'
    cache_folder = join(output_dir, '.cache')
    meta_file = join(cache_folder, 'DL3DV-valid.csv')
    os.makedirs(cache_folder, exist_ok=True)
    if not os.path.exists(meta_file):
        assert download_from_url(meta_link, meta_file), 'Download meta file failed.'

    df = pd.read_csv(meta_file)

    # if hash is set, ignore the subset_opt
    if hash_name != '':
        assert hash_name in df['hash'].values, f'Hash {hash_name} not found in the meta file.'
        batch = df[df['hash'] == hash_name]['batch'].values[0]
        ret = [to_download_item(hash_name, reso_opt, batch, file_type)]
        return ret

    # if hash is not set, download the whole subset
    subdf = df[df['batch'] == subset_opt]
    for _, r in subdf.iterrows():
        ret.append(to_download_item(r['hash'], reso_opt, subset_opt, file_type))
    return ret


def download(download_list: list, output_dir: str, is_clean_cache: bool):
    """ Download the dataset based on the download_list and user options.

    :param download_list: the list of files to download, [{'repo', 'rel_path'}]
    :param output_dir: the output directory
    :param is_clean_cache: if set, clean the huggingface cache to save space
    """
    succ_count = 0
    for item in tqdm(download_list, desc='Downloading'):
        repo = item['repo']
        rel_path = item['rel_path']

        # skip if the target already exists locally
        output_path = os.path.join(output_dir, rel_path).replace('.zip', '')
        if os.path.exists(output_path):
            succ_count += 1
            continue

        succ = hf_download_path(repo, rel_path, output_dir)
        if succ:
            succ_count += 1
            if is_clean_cache:
                clean_huggingface_cache(output_dir, repo)

            # unzip the file into output_dir/batch/hash_name
            if rel_path.endswith('.zip'):
                zip_file = join(output_dir, rel_path)
                hash_name = os.path.basename(rel_path).replace('.zip', '')
                ofile = join(output_dir, os.path.dirname(rel_path), hash_name)
                os.makedirs(ofile, exist_ok=True)
                with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                    zip_ref.extractall(ofile)

                # flatten a nested hash/hash/ structure if the zip contains one
                inner_hash_dir = join(ofile, hash_name)
                if os.path.exists(inner_hash_dir) and os.path.isdir(inner_hash_dir):
                    for entry in os.listdir(inner_hash_dir):
                        shutil.move(join(inner_hash_dir, entry), join(ofile, entry))
                    # remove the now-empty inner hash directory
                    os.rmdir(inner_hash_dir)
                os.remove(zip_file)
        else:
            print(f'Download {rel_path} failed')

    print(f'Summary: {succ_count}/{len(download_list)} files downloaded successfully')
    return succ_count == len(download_list)
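
# A minimal programmatic sketch of the two functions above, for use from other code.
# 'example_out' is a hypothetical output directory, not part of the dataset:
#   items = get_download_list(subset_opt='1K', hash_name='', reso_opt='480P',
#                             file_type='images+poses', output_dir='example_out')
#   download(items, 'example_out', is_clean_cache=True)
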

def download_dataset(args):
    """ Download the dataset based on the user inputs.

    :param args: argparse args, used to decide the subset
    :return: True if all downloads succeed, False otherwise
    """
    output_dir = args.odir
    subset_opt = args.subset
    reso_opt = args.resolution
    hash_name = args.hash
    file_type = args.file_type
    is_clean_cache = args.clean_cache

    os.makedirs(output_dir, exist_ok=True)
    download_list = get_download_list(subset_opt, hash_name, reso_opt, file_type, output_dir)
    return download(download_list, output_dir, is_clean_cache)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--odir', type=str, help='output directory', required=True)
    parser.add_argument('--subset',
                        choices=['1K', '2K', '3K', '4K', '5K', '6K', '7K', '8K', '9K', '10K', '11K', 'all'],
                        help='The subset of the benchmark to download',
                        required=True)
    parser.add_argument('--resolution',
                        choices=['4K', '2K', '960P', '480P'],
                        help='The resolution to download',
                        required=True)
    parser.add_argument('--file_type',
                        choices=['images+poses', 'video', 'colmap_cache'],
                        help='The file type to download',
                        required=True)
    parser.add_argument('--hash',
                        type=str,
                        help='If set, only the scene with this hash is downloaded (--subset is ignored)',
                        default='')
    parser.add_argument('--clean_cache',
                        action='store_true',
                        help='If set, clean the huggingface cache to save space')
    params = parser.parse_args()

    if params.file_type == 'images+poses':
        repo = resolution2repo[params.resolution]
    elif params.file_type == 'video':
        repo = 'DL3DV/DL3DV-ALL-video'
    elif params.file_type == 'colmap_cache':
        repo = 'DL3DV/DL3DV-ALL-ColmapCache'

    if not verify_access(repo):
        print(f'You have not been granted access yet. Go to the relevant huggingface repo '
              f'(https://huggingface.co/datasets/{repo}) and apply for access.')
        exit(1)

    if params.subset == 'all':
        # handle the 'all' subset option: download all 11 subsets in parallel, one process each
        subsets = ['1K', '2K', '3K', '4K', '5K', '6K', '7K', '8K', '9K', '10K', '11K']
        processes = []
        print('Downloading all 11 subsets in parallel...')
        for subset in subsets:
            # give each subprocess its own copy of the params
            subset_params = argparse.Namespace(odir=params.odir,
                                               subset=subset,
                                               resolution=params.resolution,
                                               file_type=params.file_type,
                                               hash=params.hash,
                                               clean_cache=params.clean_cache)
            p = Process(target=download_dataset, args=(subset_params,))
            p.start()
            processes.append(p)
            print(f'Started process for subset {subset}')

        # wait for all processes to complete
        for p in processes:
            p.join()
        print('All downloads completed. Refer to', params.odir)
    else:
        # single subset download
        if download_dataset(params):
            print('Download Done. Refer to', params.odir)
        else:
            print(f'Download to {params.odir} failed. See error message.')
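
# Example invocations (assuming this file is saved as download.py; the output
# directory name is a placeholder):
#   python download.py --odir DL3DV-10K --subset 1K --resolution 480P --file_type images+poses --clean_cache
#   python download.py --odir DL3DV-10K --subset all --resolution 960P --file_type video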