import typing as tp

from transformers import PretrainedConfig


class GiddConfig(PretrainedConfig):
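    """Hugging Face configuration for ``gidd`` models.

    Stores the architecture hyperparameters (hidden sizes, attention and RoPE
    settings, initialization scales) together with the noise-related fields
    (``noise_type``, ``min_log_snr``, ``max_log_snr``) consumed by the model
    implementation.
    """
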
    model_type: str = "gidd"

    def __init__(
        self,
        vocab_size: int = 131072,
        hidden_size: int = 768,
        intermediate_size: int = 3072,
        num_hidden_layers: int = 12,
        num_attention_heads: int = 12,
        head_dim: tp.Optional[int] = None,
        is_causal: bool = False,
        attn_soft_cap: float = 30.0,
        max_position_embeddings: int = 1024,
        resid_scale: float = 4.0,
        rms_norm_eps: float = 1e-6,
        use_qk_norm: bool = True,
        init_scale: float = 0.4,
        emb_init_scale: float = 0.1,
        head_init_scale: float = 0.0,
        weight_scaling: str = "fan_in",
        head_scaling: float = 1.0,
        bos_token_id: int = 0,
        eos_token_id: int = 1,
        rope_theta: float = 10000.0,
        rope_scaling: tp.Optional[tp.Dict[str, tp.Union[str, float]]] = None,
        attention_bias: bool = False,
        mlp_bias: bool = False,
        tie_word_embeddings: bool = False,
        attn_performer: str = "eager",
        noise_type: float = 0.0,
        min_log_snr: float = -9.0,
        max_log_snr: float = 9.0,
        **kwargs,
    ):
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.rope_theta = rope_theta
        self.num_attention_heads = num_attention_heads
        self.attn_soft_cap = attn_soft_cap
        self.is_causal = is_causal
        self.max_position_embeddings = max_position_embeddings
        self.resid_scale = resid_scale
        self.init_scale = init_scale
        self.emb_init_scale = emb_init_scale
        self.head_init_scale = head_init_scale
        self.weight_scaling = weight_scaling
        self.head_scaling = head_scaling
        self.rms_norm_eps = rms_norm_eps
        self.use_qk_norm = use_qk_norm
        self.attention_bias = attention_bias
        self.mlp_bias = mlp_bias
        self.rope_scaling = rope_scaling
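        # Default head_dim to hidden_size // num_attention_heads when not set explicitly.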
        self.head_dim = (
            head_dim if head_dim is not None else hidden_size // num_attention_heads
        )
        self.tie_word_embeddings = tie_word_embeddings
        self.attn_performer = attn_performer
        self.noise_type = noise_type
        self.min_log_snr = min_log_snr
        self.max_log_snr = max_log_snr
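

# Illustrative usage sketch: instantiate the config with a couple of overrides and
# inspect the derived head_dim. Only names defined in GiddConfig's signature are used.
if __name__ == "__main__":
    config = GiddConfig(hidden_size=1024, num_attention_heads=16)
    print(config.model_type)  # -> "gidd"
    print(config.head_dim)    # -> 64, derived as hidden_size // num_attention_heads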