Upload folder using huggingface_hub

Browse files

Files changed (12) hide show

config.json +41 -0
configuration_gidd.py +74 -0
generation_config.json +6 -0
model-00001-of-00004.safetensors +3 -0
model-00002-of-00004.safetensors +3 -0
model-00003-of-00004.safetensors +3 -0
model-00004-of-00004.safetensors +3 -0
model.safetensors.index.json +487 -0
modeling_gidd.py +1134 -0
special_tokens_map.json +30 -0
tokenizer.json +0 -0
tokenizer_config.json +140 -0

config.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "architectures": [
+    "GiddForDiffusionLM"
+  ],
+  "attention_bias": true,
+  "attn_performer": "eager",
+  "attn_soft_cap": 30.0,
+  "auto_map": {
+    "AutoConfig": "configuration_gidd.GiddConfig",
+    "AutoModel": "modeling_gidd.GiddModel",
+    "AutoModelForCausalLM": "modeling_gidd.GiddForDiffusionLM"
+  },
+  "bos_token_id": 0,
+  "emb_init_scale": 0.1,
+  "eos_token_id": 1,
+  "head_dim": 128,
+  "head_init_scale": 0.0,
+  "head_scaling": 0.1111111111111111,
+  "hidden_size": 4608,
+  "init_scale": 0.005892556509887897,
+  "intermediate_size": 18432,
+  "is_causal": false,
+  "max_log_snr": 9.0,
+  "max_position_embeddings": 2048,
+  "min_log_snr": -9.0,
+  "mlp_bias": true,
+  "model_type": "gidd",
+  "noise_type": 1000.0,
+  "num_attention_heads": 36,
+  "num_hidden_layers": 34,
+  "resid_scale": 4.0,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.54.0",
+  "use_qk_norm": true,
+  "vocab_size": 131072,
+  "weight_scaling": 1.0
+}

configuration_gidd.py ADDED Viewed

	@@ -0,0 +1,74 @@

+import typing as tp
+from transformers import PretrainedConfig
+class GiddConfig(PretrainedConfig):
+    model_type: str = "gidd"
+    def __init__(
+        self,
+        vocab_size: int = 131072,
+        hidden_size: int = 768,
+        intermediate_size: int = 3072,
+        num_hidden_layers: int = 12,
+        num_attention_heads: int = 12,
+        head_dim: tp.Optional[int] = None,
+        is_causal: bool = False,
+        attn_soft_cap: float = 30.0,
+        max_position_embeddings: int = 1024,
+        resid_scale: float = 4.0,
+        rms_norm_eps: float = 1e-6,
+        use_qk_norm: bool = True,
+        init_scale: float = 0.4,
+        emb_init_scale: float = 0.1,
+        head_init_scale: float = 0.0,
+        weight_scaling: str = "fan_in",
+        head_scaling: float = 1.0,
+        bos_token_id: int = 0,
+        eos_token_id: int = 1,
+        rope_theta: float = 10000.0,
+        rope_scaling: tp.Dict[str, tp.Union[str, float]] = None,
+        attention_bias: bool = False,
+        mlp_bias: bool = False,
+        tie_word_embeddings: bool = False,
+        attn_performer: str = "eager",
+        noise_type: float = 0.0,
+        min_log_snr: float = -9.0,
+        max_log_snr: float = 9.0,
+        **kwargs,
+    ):
+        super().__init__(
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.rope_theta = rope_theta
+        self.num_attention_heads = num_attention_heads
+        self.attn_soft_cap = attn_soft_cap
+        self.is_causal = is_causal
+        self.max_position_embeddings = max_position_embeddings
+        self.resid_scale = resid_scale
+        self.init_scale = init_scale
+        self.emb_init_scale = emb_init_scale
+        self.head_init_scale = head_init_scale
+        self.weight_scaling = weight_scaling
+        self.head_scaling = head_scaling
+        self.rms_norm_eps = rms_norm_eps
+        self.use_qk_norm = use_qk_norm
+        self.attention_bias = attention_bias
+        self.mlp_bias = mlp_bias
+        self.rope_scaling = rope_scaling
+        self.head_dim = (
+            head_dim if head_dim is not None else hidden_size // num_attention_heads
+        )
+        self.tie_word_embeddings = tie_word_embeddings
+        self.attn_performer = attn_performer
+        self.noise_type = noise_type
+        self.min_log_snr = min_log_snr
+        self.max_log_snr = max_log_snr

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 1,
+  "transformers_version": "4.54.0"
+}

model-00001-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0ee10c0dc3b60f8f4ea60be06c590920a7b85078a250511aac0db4c184c4fcf0
+size 4946119120

model-00002-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dd8257df5e1bbcd0dc099389a5f80f58a4c854baa669bac3cd4799f75714253a
+size 4970074808

model-00003-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c50fd4558188506075bb3b1e1732c6c4fbff651d80fa57e853bb1a5ffbf37b98
+size 4885038200

model-00004-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:22ba8c7f034ceece1884afee15a202e03ecbe1194540b72d0cd81fbfb959c2b2
+size 4946128048

model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,487 @@

+{
+  "metadata": {
+    "total_parameters": 9873022464,
+    "total_size": 19747307520
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00004-of-00004.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.attn_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.v_bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.attn_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.v_bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.10.attn_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.attn_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.v_bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.attn_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.v_bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.attn_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.v_bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.attn_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.v_bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.attn_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.v_bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.attn_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.v_bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.attn_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.mlp_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.self_attn.k_bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.v_bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.attn_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.mlp_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.k_bias": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.v_bias": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.attn_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.k_bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.v_bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.2.attn_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.v_bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.20.attn_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.v_bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.attn_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.v_bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.attn_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.v_bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.attn_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.v_bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.attn_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.v_bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.attn_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.v_bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.attn_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.26.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.26.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.26.self_attn.k_bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.v_bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.attn_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.mlp_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.self_attn.k_bias": "model-00004-of-00004.safetensors",
+    "model.layers.27.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.self_attn.v_bias": "model-00004-of-00004.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.28.attn_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.28.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.28.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.28.mlp_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.28.self_attn.k_bias": "model-00004-of-00004.safetensors",
+    "model.layers.28.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.28.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.28.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.28.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.28.self_attn.v_bias": "model-00004-of-00004.safetensors",
+    "model.layers.28.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.29.attn_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.29.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.29.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.29.mlp_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.29.self_attn.k_bias": "model-00004-of-00004.safetensors",
+    "model.layers.29.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.29.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.29.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.29.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.29.self_attn.v_bias": "model-00004-of-00004.safetensors",
+    "model.layers.29.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.3.attn_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.v_bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.30.attn_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.30.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.30.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.30.mlp_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.30.self_attn.k_bias": "model-00004-of-00004.safetensors",
+    "model.layers.30.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.30.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.30.self_attn.v_bias": "model-00004-of-00004.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.attn_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.self_attn.k_bias": "model-00004-of-00004.safetensors",
+    "model.layers.31.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.self_attn.v_bias": "model-00004-of-00004.safetensors",
+    "model.layers.31.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.32.attn_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.32.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
+    "model.layers.32.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.32.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
+    "model.layers.32.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.32.mlp_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.32.self_attn.k_bias": "model-00004-of-00004.safetensors",
+    "model.layers.32.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.32.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.32.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.32.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.32.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.32.self_attn.v_bias": "model-00004-of-00004.safetensors",
+    "model.layers.32.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.33.attn_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.33.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
+    "model.layers.33.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.33.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
+    "model.layers.33.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.33.mlp_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.33.self_attn.k_bias": "model-00004-of-00004.safetensors",
+    "model.layers.33.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.33.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.33.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.33.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.33.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.33.self_attn.v_bias": "model-00004-of-00004.safetensors",
+    "model.layers.33.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.4.attn_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.v_bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.attn_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.v_bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.attn_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.k_bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.v_bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.attn_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.mlp_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.self_attn.k_bias": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.v_bias": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.attn_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.k_bias": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.v_bias": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.attn_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.v_bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.norm.weight": "model-00004-of-00004.safetensors"
+  }
+}

modeling_gidd.py ADDED Viewed

	@@ -0,0 +1,1134 @@

+import typing as tp
+import warnings
+from functools import partial
+from dataclasses import dataclass
+import torch
+import torch.nn as nn
+from torch.nn.attention.flex_attention import flex_attention
+from transformers import PreTrainedModel
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.generation.utils import GenerationMixin
+from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from .configuration_gidd import GiddConfig
+@dataclass
+class AttentionLayerOutput:
+    hidden_states: torch.Tensor
+    attentions: tp.Optional[torch.Tensor] = None
+    past_key_values: tp.Optional[tp.List[tp.Tuple[torch.Tensor, torch.Tensor]]] = None
+@dataclass
+class DecoderLayerOutput:
+    hidden_states: torch.Tensor
+    attentions: tp.Optional[torch.Tensor] = None
+    past_key_values: tp.Optional[tp.List[tp.Tuple[torch.Tensor, torch.Tensor]]] = None
+def promote_dtype(args: tuple, *, dtype: torch.dtype | None = None) -> tuple:
+    return tuple(
+        torch.as_tensor(x, dtype=dtype) if x is not None else None
+        for x in args
+    )
+class ScaledLinear(nn.Module):
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        *,
+        scale: float | tp.Literal["fan_in", "fan_out"] = 1.0,
+        use_bias: bool = True,
+        dtype: torch.dtype | None = None,
+    ):
+        super().__init__()
+        if scale == "fan_in":
+            scale = in_features**-0.5
+        elif scale == "fan_out":
+            scale = out_features**-0.5
+        if scale != 1.0:
+            def _scale_operator(x):
+                return x * scale
+        else:
+            def _scale_operator(x):
+                return x
+        self._scale_operator = _scale_operator
+        self.in_features = in_features
+        self.out_features = out_features
+        self.use_bias = use_bias
+        weight_shape = (out_features, in_features)
+        weight = torch.zeros(weight_shape, dtype=dtype)
+        self.weight = nn.Parameter(weight)
+        if use_bias:
+            bias = torch.zeros((out_features,), dtype=dtype)
+            self.bias = nn.Parameter(bias)
+        else:
+            self.bias = None
+    def forward(
+        self,
+        inputs: torch.Tensor,
+        w: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        dtype = inputs.dtype
+        weight = self.weight if w is None else w
+        bias = self.bias if self.use_bias else None
+        if bias is not None:
+            inputs, weight, bias = promote_dtype((inputs, weight, bias), dtype=dtype)
+        else:
+            inputs, weight = promote_dtype((inputs, weight), dtype=dtype)
+        y = torch.matmul(
+            inputs,
+            weight.T,
+        )
+        y = self._scale_operator(y)
+        if bias is not None:
+            y = y + bias.reshape((1,) * (y.ndim - 1) + (-1,))
+        return y
+def _apply_rotary_emb(
+    x: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    is_neox_style: bool,
+) -> torch.Tensor:
+    cos = cos.unsqueeze(2).to(dtype=x.dtype)
+    sin = sin.unsqueeze(2).to(dtype=x.dtype)
+    assert sin.ndim == x.ndim
+    if is_neox_style:
+        x1, x2 = torch.chunk(x, 2, dim=-1)
+    else:
+        x1 = x[..., ::2]
+        x2 = x[..., 1::2]
+    o1 = x1 * cos - x2 * sin
+    o2 = x2 * cos + x1 * sin
+    if is_neox_style:
+        return torch.cat((o1, o2), dim=-1)
+    else:
+        return torch.stack((o1, o2), dim=-1).reshape(x.shape)
+def apply_basic_rope(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    positions: torch.Tensor,
+    frequencies: torch.Tensor,
+    rotary_dim: int,
+    is_neox_style: bool,
+    offsets: torch.Tensor | None = None,
+    dtype: torch.dtype = torch.float32,
+):
+    if offsets is not None:
+        positions = positions + offsets
+    cos, sin = torch.chunk(frequencies[positions], 2, dim=-1)
+    if rotary_dim != query.shape[-1]:
+        query_rot = _apply_rotary_emb(query[..., :rotary_dim], cos, sin, is_neox_style)
+        query = torch.cat((query_rot, query[..., rotary_dim:]), dim=-1)
+        key_rot = _apply_rotary_emb(key[..., :rotary_dim], cos, sin, is_neox_style)
+        key = torch.cat((key_rot, key[..., rotary_dim:]), dim=-1)
+        return query.to(dtype), key.to(dtype), cos, sin
+    else:
+        query = _apply_rotary_emb(query, cos, sin, is_neox_style)
+        key = _apply_rotary_emb(key, cos, sin, is_neox_style)
+        return query.to(dtype), key.to(dtype), cos, sin
+def compute_basic_frequencies(
+    base: int,
+    rotary_dim: int,
+    max_position_embeddings: int,
+):
+    inv = 1.0 / torch.pow(
+        base,
+        torch.arange(0, rotary_dim, 2, dtype=torch.float32) / rotary_dim,
+    )
+    freqs = torch.einsum(
+        "i,j->ij",
+        torch.arange(max_position_embeddings, dtype=torch.float32),
+        inv,
+    )
+    freqs = torch.cat([freqs.cos(), freqs.sin()], dim=-1)
+    return freqs
+class RotaryEmbedding(nn.Module):
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        dtype: torch.dtype,
+    ):
+        super().__init__()
+        self.head_size = head_size
+        self.rotary_dim = rotary_dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        self.is_neox_style = is_neox_style
+        self.dtype = dtype
+    def forward(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: torch.Tensor | None = None,
+        frequencies: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if frequencies is None:
+            frequencies = compute_basic_frequencies(
+                base=self.base,
+                rotary_dim=self.rotary_dim,
+                max_position_embeddings=self.max_position_embeddings,
+            )
+        if hasattr(frequencies, "value"):
+            frequencies = frequencies.value
+        return apply_basic_rope(
+            query=query,
+            key=key,
+            positions=positions,
+            frequencies=frequencies,
+            rotary_dim=self.rotary_dim,
+            is_neox_style=self.is_neox_style,
+            offsets=offsets,
+            dtype=self.dtype,
+        )
+class GiddRMSNorm(nn.Module):
+    def __init__(
+        self,
+        config: GiddConfig,
+        dtype=torch.float32,
+    ):
+        super().__init__()
+        self.config = config
+        self.epsilon = self.config.rms_norm_eps
+        self.weight = nn.Parameter(torch.zeros(self.config.hidden_size, dtype=dtype))
+        # self.bias = nn.Parameter(torch.zeros(self.config.hidden_size, dtype=dtype))
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        dtype = hidden_states.dtype
+        variance = hidden_states.to(torch.float32)
+        variance = variance.pow(2.0)
+        variance = variance.mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.epsilon)
+        hidden_states = ((1 + self.weight) * hidden_states)
+        return hidden_states.to(dtype)
+ALL_LAYERNORM_LAYERS.append(GiddRMSNorm)
+class GiddMLP(nn.Module):
+    def __init__(
+        self,
+        config: GiddConfig,
+        dtype=torch.float32,
+    ):
+        super().__init__()
+        self.config = config
+        self.dtype = dtype
+        linear_class = partial(
+            ScaledLinear,
+            scale=config.weight_scaling,
+            dtype=dtype,
+            use_bias=self.config.mlp_bias,
+        )
+        self.up_proj = linear_class(config.hidden_size, config.intermediate_size)
+        self.down_proj = linear_class(config.intermediate_size, config.hidden_size)
+    def forward(self, h: torch.Tensor) -> torch.Tensor:
+        h = self.up_proj(h)
+        h = torch.relu(h) ** 2
+        h = self.down_proj(h)
+        return h
+class FlexSoftcapAttention(nn.Module):
+    def __init__(self, head_dim, n_heads, softmax_scale, soft_cap):
+        super().__init__()
+        self.d_model = head_dim * n_heads
+        self.n_heads = n_heads
+        self.head_dim = head_dim
+        self.scale = float(softmax_scale)
+        self.soft_cap = float(soft_cap)
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        attention_mask: torch.Tensor | None = None,
+    ):
+        B, _, L = q.shape[:3]
+        def score_mod(score, b, h, q_idx, kv_idx):
+            soft_cap = self.soft_cap
+            score = soft_cap * torch.tanh(score / soft_cap)
+            keep = attention_mask[b, q_idx, kv_idx]
+            return torch.where(keep, score, torch.finfo(score.dtype).min)
+        out = flex_attention(
+            q,
+            k,
+            v,
+            score_mod=score_mod,
+            scale=self.scale,
+        )
+        out = out.transpose(1, 2).contiguous().view(B, L, self.d_model)
+        return out, None
+class VanillaSoftcapAttention(nn.Module):
+    def __init__(self, head_dim, n_heads, softmax_scale, soft_cap):
+        super().__init__()
+        self.d_model = head_dim * n_heads
+        self.n_heads = n_heads
+        self.head_dim = head_dim
+        self.scale = float(softmax_scale)
+        self.soft_cap = float(soft_cap)
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        attention_mask: torch.Tensor | None = None,
+    ):
+        B, _, L = q.shape[:3]
+        scores = torch.einsum(
+            "bhqd,bhkd->bhqk",
+            q * self.scale,
+            k,
+        )
+        scores = self.soft_cap * torch.tanh(scores / self.soft_cap)
+        if attention_mask is not None:
+            scores = scores.masked_fill(~attention_mask.unsqueeze(1), torch.finfo(scores.dtype).min)
+        probs = torch.softmax(scores.to(torch.float32), dim=-1).to(scores.dtype)
+        out = torch.einsum(
+            "bhqk,bhkd->bhqd",
+            probs,
+            v,
+        )
+        out = out.transpose(1, 2).contiguous().view(B, L, self.d_model)
+        return out, probs
+class GiddAttention(nn.Module):
+    def __init__(
+        self,
+        config: GiddConfig,
+        layer_idx: int,
+        dtype=torch.float32,
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        head_dim = config.hidden_size // config.num_attention_heads
+        self.head_dim = getattr(config, "head_dim", head_dim)
+        self.num_attention_heads = self.hidden_size // self.head_dim
+        self.is_causal = config.is_causal
+        self.layer_idx = layer_idx
+        self.use_qk_norm = config.use_qk_norm
+        if self.use_qk_norm:
+            self.q_norm = GiddRMSNorm(config, dtype=torch.float32)
+            self.k_norm = GiddRMSNorm(config, dtype=torch.float32)
+        else:
+            self.q_norm = None
+            self.k_norm = None
+        self.attention_bias = config.attention_bias
+        if self.attention_bias:
+            self.k_bias = nn.Parameter(
+                torch.zeros((self.num_attention_heads, self.head_dim), dtype=dtype),
+            )
+            self.v_bias = nn.Parameter(
+                torch.zeros((self.num_attention_heads, self.head_dim), dtype=dtype),
+            )
+        else:
+            self.k_bias = None
+            self.v_bias = None
+        linear_class = partial(
+            ScaledLinear,
+            scale=config.weight_scaling,
+            dtype=dtype,
+            use_bias=False,
+        )
+        self.q_proj = linear_class(
+            self.hidden_size,
+            self.num_attention_heads * self.head_dim,
+        )
+        self.k_proj = linear_class(
+            self.hidden_size,
+            self.num_attention_heads * self.head_dim,
+        )
+        self.v_proj = linear_class(
+            self.hidden_size,
+            self.num_attention_heads * self.head_dim,
+        )
+        self.o_proj = linear_class(
+            self.num_attention_heads * self.head_dim,
+            self.hidden_size,
+        )
+        self.rotary = RotaryEmbedding(
+            head_size=self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position_embeddings=config.max_position_embeddings,
+            base=config.rope_theta,
+            is_neox_style=True,
+            dtype=dtype,
+        )
+        if config.attn_performer == "flex":
+            self.attention_performer = FlexSoftcapAttention(
+                head_dim=self.head_dim,
+                n_heads=self.num_attention_heads,
+                softmax_scale=self.head_dim**-0.5,
+                soft_cap=config.attn_soft_cap,
+            )
+        elif config.attn_performer == "eager":
+            self.attention_performer = VanillaSoftcapAttention(
+                head_dim=self.head_dim,
+                n_heads=self.num_attention_heads,
+                softmax_scale=self.head_dim**-0.5,
+                soft_cap=config.attn_soft_cap,
+            )
+        else:
+            raise ValueError(f"Unknown attn_performer: {config.attn_performer}")
+    def concatenate(
+        self,
+        *,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        attention_mask: torch.Tensor,
+        past_key_values: tp.Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+    ):
+        assert query.shape[1] == key.shape[1], "Query and Key lengths must match for GIDD attention."
+        if attention_mask is not None:
+            if attention_mask.dtype != torch.bool:
+                warnings.warn("attention_mask should be a boolean array", stacklevel=1)
+                attention_mask = (attention_mask == 1)
+        batch_size = query.shape[0]
+        # shape of attention_mask: (batch_size, seq_len)
+        # or (batch_size, query_len, kv_len)
+        if attention_mask.ndim == 2:
+            attention_mask = attention_mask.unsqueeze(1)
+            attention_mask = attention_mask.expand(-1, query.shape[1], -1)
+        elif attention_mask.ndim == 3:
+            # already in correct shape
+            pass
+        if self.attention_bias:
+            ones = torch.ones(
+                attention_mask.shape[:2] + (1,),
+                dtype=attention_mask.dtype,
+                device=attention_mask.device,
+            )
+            attention_mask = torch.cat(
+                [
+                    ones,
+                    attention_mask,
+                ],
+                dim=-1,
+            )
+        if past_key_values is not None:
+            past_keys, past_values = past_key_values
+            key = torch.cat([past_keys, key], dim=1)
+            value = torch.cat([past_values, value], dim=1)
+        elif self.attention_bias:
+            n_heads = self.num_attention_heads
+            bias_shape = (batch_size, 1, n_heads, self.head_dim)
+            k_bias = self.k_bias.view(1, 1, n_heads, self.head_dim).expand(bias_shape)
+            v_bias = self.v_bias.view(1, 1, n_heads, self.head_dim).expand(bias_shape)
+            key = torch.cat([k_bias, key], dim=1)
+            value = torch.cat([v_bias, value], dim=1)
+        # shape of attention_mask: (batch_size, 1, query_len, kv_len + 1)
+        return query, key, value, attention_mask, (key, value)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        position_ids: torch.Tensor,
+        past_key_values: tp.Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        frequencies: tp.Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> AttentionLayerOutput:
+        batch_size, sequence_length = hidden_states.shape[:2]
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+        if self.use_qk_norm:
+            query_states = self.q_norm(query_states)
+            key_states = self.k_norm(key_states)
+        qshape = (
+            batch_size,
+            sequence_length,
+            self.num_attention_heads,
+            self.head_dim,
+        )
+        kv_shape = (
+            batch_size,
+            sequence_length,
+            self.num_attention_heads,
+            self.head_dim,
+        )
+        query_states = query_states.view(qshape)
+        key_states = key_states.view(kv_shape)
+        value_states = value_states.view(kv_shape)
+        query_states, key_states, cos, sin = self.rotary(
+            positions=position_ids,
+            query=query_states,
+            key=key_states,
+            frequencies=frequencies,
+        )
+        (
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            past_key_values,
+        ) = self.concatenate(
+            query=query_states,
+            key=key_states,
+            value=value_states,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+        )
+        attention_out, attentions = self.attention_performer.forward(
+            q=query_states.transpose(1, 2),
+            k=key_states.transpose(1, 2),
+            v=value_states.transpose(1, 2),
+            attention_mask=attention_mask,
+        )
+        attn_output = self.o_proj(attention_out)
+        return AttentionLayerOutput(
+            hidden_states=attn_output,
+            attentions=attentions if output_attentions else None,
+            past_key_values=past_key_values,
+        )
+class GiddLayer(nn.Module):
+    def __init__(
+        self,
+        config: GiddConfig,
+        layer_idx: int,
+        dtype=torch.float32,
+        resid_scale: float = 1.0,
+    ):
+        super().__init__()
+        self.config = config
+        self.resid_scale = resid_scale
+        self.layer_idx = layer_idx
+        self.self_attn = GiddAttention(
+            layer_idx=layer_idx,
+            config=config,
+            dtype=dtype,
+        )
+        self.mlp = GiddMLP(
+            config=config,
+            dtype=dtype,
+        )
+        self.attn_layernorm = GiddRMSNorm(
+            config=config,
+            dtype=torch.float32,
+        )
+        self.mlp_layernorm = GiddRMSNorm(
+            config=config,
+            dtype=torch.float32,
+        )
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        position_ids: torch.Tensor,
+        past_key_values: tp.Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        frequencies: tp.Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> DecoderLayerOutput:
+        attn_inputs = self.attn_layernorm(hidden_states)
+        attn_outputs = self.self_attn(
+            attn_inputs,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            frequencies=frequencies,
+            output_attentions=output_attentions,
+        )
+        hidden_states = hidden_states + self.resid_scale * attn_outputs.hidden_states
+        mlp_inputs = self.mlp_layernorm(hidden_states)
+        mlp_output = self.mlp(mlp_inputs)
+        hidden_states = hidden_states + self.resid_scale * mlp_output
+        return DecoderLayerOutput(
+            hidden_states=hidden_states,
+            attentions=attn_outputs.attentions,
+            past_key_values=attn_outputs.past_key_values,
+        )
+class GiddPreTrainedModel(PreTrainedModel):
+    config_class = GiddConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = False
+    _no_split_modules = ["GiddLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = False
+    _supports_sdpa = False
+    _supports_flex_attn = False
+    _can_compile_fullgraph = False
+    _supports_attention_backend = False
+    _can_record_outputs = {
+        "hidden_states": GiddLayer,
+        "attentions": GiddAttention,
+    }
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
+class GiddModel(GiddPreTrainedModel):
+    def __init__(
+        self,
+        config: GiddConfig,
+    ):
+        super().__init__(config=config)
+        self.resid_scale = config.resid_scale / config.num_hidden_layers
+        dtype = config.torch_dtype
+        self.embed_tokens = nn.Embedding(
+            num_embeddings=self.config.vocab_size,
+            embedding_dim=self.config.hidden_size,
+        )
+        self.embed_tokens.weight.data = self.embed_tokens.weight.data.to(dtype)
+        nn.init.normal_(self.embed_tokens.weight, mean=0.0, std=self.config.emb_init_scale)
+        freqs = compute_basic_frequencies(
+            base=config.rope_theta,
+            rotary_dim=config.hidden_size // config.num_attention_heads,
+            max_position_embeddings=config.max_position_embeddings,
+        )
+        self.frequencies = nn.Buffer(freqs, persistent=False)
+        self.layers = nn.ModuleList(
+            [
+                GiddLayer(
+                    config=config,
+                    layer_idx=i,
+                    resid_scale=self.resid_scale,
+                    dtype=dtype,
+                )
+                for i in range(self.config.num_hidden_layers)
+            ]
+        )
+        self.norm = GiddRMSNorm(
+            config=config,
+            dtype=torch.float32,
+        )
+    def forward(
+        self,
+        input_ids: tp.Optional[torch.Tensor] = None,
+        inputs_embeds: tp.Optional[torch.Tensor] = None,
+        attention_mask: tp.Optional[torch.Tensor] = None,
+        position_ids: tp.Optional[torch.Tensor] = None,
+        past_key_values: tp.Optional[list[tuple[torch.Tensor, torch.Tensor]]] = None,
+        use_cache: bool = False,
+        cache_position: tp.Optional[torch.LongTensor] = None,
+        output_attentions: tp.Optional[bool] = None,
+        output_hidden_states: tp.Optional[bool] = None,
+    ) -> BaseModelOutputWithPast:
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError(
+                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
+            )
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids.to(torch.long))
+        if use_cache and past_key_values is None:
+            past_key_values = [None] * self.config.num_hidden_layers
+        elif past_key_values is not None:
+            past_key_values = list(past_key_values)
+        if position_ids is None:
+            past_seen_tokens = 0
+            if past_key_values is not None and any(past_key_values):
+                past_seen_tokens = [kv[0].shape[1] for kv in past_key_values if kv is not None][0]
+            cache_position = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device) + past_seen_tokens
+            position_ids = cache_position.unsqueeze(0)
+        batch_size, sequence_length, _ = inputs_embeds.shape
+        assert sequence_length <= self.config.max_position_embeddings, (
+            f"Maximum Position Embedding Reached ! (expected <= {self.config.max_position_embeddings} got {sequence_length})"
+        )
+        if attention_mask is None:
+            attention_mask = torch.ones(
+                (batch_size, sequence_length),
+                dtype=torch.bool,
+                device=inputs_embeds.device,
+            )
+        else:
+            if attention_mask.dtype != torch.bool:
+                attention_mask = (attention_mask == 1)
+        if position_ids is None:
+            position_ids = torch.arange(
+                inputs_embeds.shape[-2],
+                dtype=torch.int32,
+                device=inputs_embeds.device,
+            )
+            position_ids = position_ids.unsqueeze(0).expand(inputs_embeds.shape[:-1])
+        hidden_states = inputs_embeds
+        all_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
+        for idx, block in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            layer_outputs = block(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                output_attentions=output_attentions,
+                frequencies=self.frequencies,
+                past_key_values=past_key_values[idx] if past_key_values is not None else None,
+            )
+            hidden_states = layer_outputs.hidden_states
+            if output_attentions:
+                all_attentions += (layer_outputs.attentions,)
+            if use_cache:
+                past_key_values[idx] = layer_outputs.past_key_values
+        hidden_states = self.norm(hidden_states)
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+            past_key_values=past_key_values,
+        )
+class GiddForDiffusionLM(GiddPreTrainedModel, GenerationMixin):
+    def __init__(
+        self,
+        config: GiddConfig,
+    ):
+        super().__init__(config=config)
+        self.model = GiddModel(config=config)
+        self.lm_head = ScaledLinear(
+            config.hidden_size,
+            config.vocab_size,
+            scale=config.head_scaling,
+            dtype=config.torch_dtype,
+            use_bias=False,
+        )
+    def forward(
+        self,
+        input_ids: tp.Optional[torch.Tensor] = None,
+        inputs_embeds: tp.Optional[torch.Tensor] = None,
+        attention_mask: tp.Optional[torch.Tensor] = None,
+        position_ids: tp.Optional[torch.Tensor] = None,
+        past_key_values: tp.Optional[list[tuple[torch.Tensor, torch.Tensor]]] = None,
+        use_cache: bool = False,
+        output_attentions: tp.Optional[bool] = None,
+        output_hidden_states: tp.Optional[bool] = None,
+    ) -> CausalLMOutputWithPast:
+        outputs = self.model(
+            input_ids=input_ids,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            use_cache=use_cache,
+        )
+        hidden_states = outputs.last_hidden_state
+        if self.config.tie_word_embeddings:
+            logits = hidden_states @ self.model.embed_tokens.weight.t()
+        else:
+            logits = self.lm_head(hidden_states)
+        return CausalLMOutputWithPast(
+            loss=None,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            past_key_values=outputs.past_key_values,
+        )
+    def _sample_prior(self, shape: tuple[int, ...], device: torch.device, mask_token_id: int = 3) -> torch.Tensor:
+        p_unif = torch.sigmoid(
+            torch.ones(shape, device=device) * self.config.min_log_snr + self.config.noise_type
+        )
+        r = torch.rand(shape, device=device)
+        unif = torch.randint(0, self.config.vocab_size, shape, device=device)
+        samples = torch.where(r < p_unif, unif, mask_token_id)
+        return samples
+    def _probs_with_topk_topp(self, logits, temperature: float, top_p: float | None, top_k: int | None):
+        if temperature == 0.0:
+            probs = torch.zeros_like(logits)
+            indices = torch.argmax(logits, dim=-1, keepdim=True)
+            probs.scatter_(-1, indices, 1.0)
+            return probs
+        x = logits / temperature
+        if top_k is not None and 0 < top_k < x.size(-1):
+            kth = torch.topk(x, top_k, dim=-1).values[..., -1, None]
+            x = torch.where(x < kth, torch.full_like(x, float("-inf")), x)
+        if top_p is not None and 0.0 < top_p < 1.0:
+            sorted_logits, sorted_idx = torch.sort(x, descending=True, dim=-1)
+            sorted_probs = torch.softmax(sorted_logits, dim=-1)
+            cumprobs = sorted_probs.cumsum(dim=-1)
+            remove = cumprobs > top_p
+            remove[..., 1:] = remove[..., :-1].clone()
+            remove[..., 0] = False
+            sorted_logits = sorted_logits.masked_fill(remove, float("-inf"))
+            x = x.scatter(-1, sorted_idx, sorted_logits)
+        probs = torch.softmax(x, dim=-1)
+        return probs
+    def _pi_lambda(self, log_snr, mask_token_id=3):
+        unif_vec = torch.ones((self.config.vocab_size,), device=log_snr.device) / (self.config.vocab_size - 1)
+        unif_vec[mask_token_id] = 0.0
+        alpha = torch.sigmoid(log_snr + self.config.noise_type)
+        pi = alpha * unif_vec
+        pi[..., mask_token_id] = 1.0 - alpha
+        return pi
+    def _sample_ancestral(
+        self,
+        z: torch.Tensor,
+        x_hat: torch.Tensor,
+        log_snr_t: torch.Tensor,
+        log_snr_s: torch.Tensor,
+        mask_token_id: int = 3,
+    ):
+        alpha_s = log_snr_s.sigmoid()
+        alpha_t = log_snr_t.sigmoid()
+        beta_s, beta_t = 1.0 - alpha_s, 1.0 - alpha_t
+        alpha_t_s = alpha_t / alpha_s
+        pi_s = self._pi_lambda(log_snr_s, mask_token_id=mask_token_id)
+        pi_t = self._pi_lambda(log_snr_t, mask_token_id=mask_token_id)
+        beta_pi_t_s = beta_t * pi_t - alpha_t_s * beta_s * pi_s
+        # beta_pi_t_s_at_z = beta_pi_t_s[z]
+        q_t = alpha_t * x_hat + beta_t * pi_t[None, None, :]
+        q_s = alpha_s * x_hat + beta_s * pi_s[None, None, :]
+        q_t_at_z = q_t.gather(-1, z.unsqueeze(-1)).squeeze(-1)
+        z_vec = torch.nn.functional.one_hot(z, num_classes=self.config.vocab_size).to(q_t.dtype)
+        q_t_s_at_z = alpha_t_s * z_vec + beta_pi_t_s[z, None]
+        p_s_t = q_s * q_t_s_at_z / q_t_at_z[..., None]
+        z_next = torch.multinomial(p_s_t.flatten(0, 1), num_samples=1).view_as(z)
+        return z_next
+    def _sample_adaptive(
+        self,
+        z: torch.Tensor,
+        logits: torch.Tensor,
+        log_snr: torch.Tensor,
+        n_tokens: int = 1,
+        mask_token_id: int = 3,
+        temperature: float = 0.0,
+        top_p: float | None = None,
+        top_k: int | None = None,
+    ):
+        pi_vec = self._pi_lambda(log_snr, mask_token_id=mask_token_id)
+        p_noise = pi_vec[z]
+        p_noise = p_noise / p_noise.sum(dim=-1, keepdim=True)
+        x_hat = logits.softmax(dim=-1)
+        p_max = x_hat.max(dim=-1).values
+        p_curr = x_hat.gather(-1, z.unsqueeze(-1)).squeeze(-1)
+        p_delta = (p_max - p_curr) * p_noise
+        next_poss = torch.topk(p_delta, n_tokens, dim=-1).indices
+        probs = self._probs_with_topk_topp(
+            logits=logits,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+        )
+        next_tokens = torch.multinomial(probs.flatten(0, 1), num_samples=1).view_as(z)
+        z_next = z.clone()
+        batch_indices = torch.arange(z.shape[0], device=z.device).unsqueeze(-1)
+        z_next[batch_indices, next_poss] = next_tokens[batch_indices, next_poss]
+        return z_next
+    @torch.no_grad()
+    def generate(
+        self,
+        inputs: tp.Optional[torch.Tensor] = None,
+        max_length: int = 2048,
+        min_length: int = 0,
+        temperature: float = 1.0,
+        block_length: int = 128,
+        steps: int = 128,
+        top_p: tp.Optional[float] = None,
+        top_k: tp.Optional[int] = None,
+        bos_token_id: int = 0,
+        eos_token_id: int = 1,
+        pad_token_id: int = 2,
+        mask_token_id: int = 3,
+        sampling_method: tp.Literal["ancestral", "adaptive"] = "ancestral",
+        noise_schedule: tp.Literal["linear", "cosine"] | tp.Callable[[torch.Tensor], torch.Tensor] = "cosine",
+        tokens_per_step: int = 1,
+        show_progress: bool = False,
+    ):
+        r"""
+        Generates tokens with block-wise denoising diffusion.
+        Parameters:
+            inputs (`torch.Tensor`):
+                The token sequence used as a prompt for the generation.
+            temperature (`float`, *optional*, defaults to 0.0):
+                The value used to module the next token probabilities. A value of 0.0 corresponds to greedy decoding.
+            block_length (`int`, *optional*, defaults to 32):
+                The size of each generation block. The model generates text in parallel within these blocks. This is a
+                key parameter for controlling the granularity of the generation process.
+            steps (`int`, *optional*, defaults to 32):
+                The number of denoising steps to perform for each block.
+            max_length (`int`, *optional*, defaults to 2048):
+                The maximum length of the sequence to be generated.
+            min_length (`int`, *optional*, defaults to 0):
+                The minimum length of the sequence to be generated.
+            top_p (`float`, *optional*):
+                If set to a float value between 0 and 1, only the most probable tokens with probabilities that add up to
+                `top_p` or higher are kept for generation (nucleus sampling).
+            top_k (`int`, *optional*):
+                The number of highest probability vocabulary tokens to keep for top-k-filtering.
+            bos_token_id (`int`, *optional*, defaults to 0):
+                The token ID for the beginning-of-sequence token.
+            eos_token_id (`int`, *optional*, defaults to 1):
+                The token ID for the end-of-sequence token.
+            pad_token_id (`int`, *optional*, defaults to 2):
+                The token ID for the padding token.
+            mask_token_id (`int`, *optional*, defaults to 3):
+                The token ID used as a placeholder for tokens that are yet to be generated.
+        Return:
+            `torch.Tensor`: A string containing the generated token IDs, starting
+            after the prompt and stopping at the first `eos_id` or `gen_length`.
+        """
+        if sampling_method not in ["ancestral", "adaptive"]:
+            raise ValueError(f"Unsupported sampling method: {sampling_method}")
+        if noise_schedule not in ["linear", "cosine"] and not callable(noise_schedule):
+            raise ValueError("noise_schedule must be 'linear', 'cosine', or a callable function.")
+        if inputs is None:
+            inputs = torch.tensor([[bos_token_id]], device=self.device, dtype=torch.long)
+            batch_size = 1
+            prompt_length = 0
+        else:
+            batch_size = inputs.shape[0]
+            prompt_length = inputs.shape[1]
+            if eos_token_id in inputs:
+                warnings.warn("Input prompt contains eos_token_id. Generation may stop earlier than expected.", stacklevel=1)
+            input_ids = inputs.to(self.device)
+        total_length = self.config.max_position_embeddings
+        if noise_schedule == "linear":
+            noise_schedule_fn = lambda t: 1.0 - t
+        elif noise_schedule == "cosine":
+            noise_schedule_fn = lambda t: 0.5 + 0.5 * torch.cos(t * torch.pi)
+        else:
+            noise_schedule_fn = noise_schedule
+        x_prior = self._sample_prior(
+            shape=(batch_size, total_length),
+            device=self.device,
+            mask_token_id=mask_token_id,
+        )
+        x = x_prior.clone()
+        if prompt_length > 0:
+            x[:, :prompt_length] = input_ids.clone()
+        position_ids = torch.arange(total_length, device=self.device)
+        position_ids = position_ids.unsqueeze(0).expand(batch_size, -1)
+        noise_mask = torch.ones_like(x, dtype=torch.bool)
+        noise_mask[:, :prompt_length] = False
+        min_log_snr = torch.tensor(self.config.min_log_snr, device=self.device)
+        max_log_snr = torch.tensor(self.config.max_log_snr, device=self.device)
+        alpha_min = torch.sigmoid(min_log_snr)
+        alpha_max = torch.sigmoid(max_log_snr)
+        ts = torch.linspace(0.0, 1.0, steps=steps + 1, device=self.device)
+        alpha_t = (alpha_max - alpha_min) * noise_schedule_fn(ts) + alpha_min
+        log_snrs = torch.log(alpha_t / (1.0 - alpha_t)).clip(min_log_snr, max_log_snr)
+        if show_progress:
+            import tqdm.auto as tqdm
+            est_num_blocks = (max_length + block_length - 1) // block_length
+            est_num_steps = est_num_blocks * steps
+            pbar = tqdm.tqdm(total=est_num_steps)
+            update_pbar = lambda n: pbar.update(n)
+            def stop_pbar():
+                pbar.total = pbar.n
+                pbar.refresh()
+            close_pbar = lambda: pbar.close()
+        else:
+            update_pbar = lambda n: None
+            stop_pbar = lambda: None
+            close_pbar = lambda: None
+        try:
+            num_blocks = 0
+            while True:
+                current_window_start = prompt_length + num_blocks * block_length
+                current_window_end = current_window_start + block_length
+                attn_mask = (noise_mask[..., :, None] >= noise_mask[..., None, :])
+                keep_logits = False
+                past_key_values = None
+                for step in range(steps, 0, -1):
+                    if past_key_values is None:
+                        output = self.forward(
+                            input_ids=x[:, :current_window_start],
+                            attention_mask=attn_mask[:, :current_window_start, :current_window_start],
+                            position_ids=position_ids[:, :current_window_start],
+                            use_cache=True,
+                        )
+                        past_key_values = output.past_key_values
+                    if not keep_logits:
+                        logits = self.forward(
+                            input_ids=x[:, current_window_start:],
+                            attention_mask=attn_mask[:, current_window_start:],
+                            position_ids=position_ids[:, current_window_start:],
+                            past_key_values=past_key_values,
+                        ).logits
+                        active_logits = logits[:, :block_length, :]
+                        # logits = self.forward(
+                        #     input_ids=x,
+                        #     attention_mask=attn_mask,
+                        #     position_ids=position_ids,
+                        #     past_key_values=None
+                        # ).logits
+                        # active_logits = logits[:, current_window_start:current_window_end, :]
+                        active_logits[..., mask_token_id] = float("-inf")
+                        min_eos_idx = max(0, min_length + prompt_length - current_window_start)
+                        active_logits[:, :min_eos_idx, eos_token_id] = float("-inf")
+                    z_t = x[:, current_window_start:current_window_end]
+                    if sampling_method == "ancestral":
+                        x_hat = self._probs_with_topk_topp(
+                            active_logits.to(torch.float32),
+                            temperature=temperature,
+                            top_k=top_k,
+                            top_p=top_p,
+                        )
+                        z_s = self._sample_ancestral(
+                            z=z_t,
+                            x_hat=x_hat,
+                            log_snr_t=log_snrs[step],
+                            log_snr_s=log_snrs[step - 1],
+                            mask_token_id=mask_token_id,
+                        )
+                    elif sampling_method == "adaptive":
+                        z_s = self._sample_adaptive(
+                            z=z_t,
+                            logits=active_logits.to(torch.float32),
+                            log_snr=log_snrs[step],
+                            n_tokens=tokens_per_step,
+                            mask_token_id=mask_token_id,
+                            temperature=temperature,
+                            top_p=top_p,
+                            top_k=top_k,
+                        )
+                    keep_logits = (z_s == z_t).all().item()
+                    x[:, current_window_start:current_window_end] = z_s.clone()
+                    update_pbar(1)
+                num_blocks += 1
+                noise_mask[:, :current_window_end] = False
+                has_eos = (x == eos_token_id).any(-1).all().item()
+                all_done = current_window_end >= max_length + prompt_length or has_eos
+                if all_done:
+                    stop_pbar()
+                    break
+        finally:
+            close_pbar()
+        generated_answer = x[:, :max_length + prompt_length]
+        eos_idx = (generated_answer == eos_token_id).int().argmax(dim=-1)
+        for i, idx in enumerate(eos_idx):
+            if idx > 0:
+                generated_answer[i, idx:] = pad_token_id
+        return generated_answer

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<|begin_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<|mask|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|padding|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,140 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|begin_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<|end_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<|padding|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<|mask|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "  ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "5": {
+      "content": "    ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "6": {
+      "content": "      ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "7": {
+      "content": "        ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "8": {
+      "content": "          ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "9": {
+      "content": "            ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "10": {
+      "content": "              ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "11": {
+      "content": "                ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "12": {
+      "content": "                  ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "13": {
+      "content": "                    ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "14": {
+      "content": "                      ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "15": {
+      "content": "                        ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<|begin_of_text|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|end_of_text|>",
+  "extra_special_tokens": {},
+  "mask_token": "<|mask|>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<|padding|>",
+  "tokenizer_class": "PreTrainedTokenizerFast"
+}