MoE Model
```
Qwen3MoeForCausalLM(
  (model): Qwen3MoeModel(
    (embed_tokens): Embedding(151936, 2048, padding_idx=151643)
    (layers): ModuleList(
      (0-47): 48 x Qwen3MoeDecoderLayer(
        (self_attn): Qwen3MoeAttention(
          (q_proj): Linear(in_features=2048, out_features=4096, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=4096, out_features=2048, bias=False)
          (q_norm): Qwen3MoeRMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3MoeRMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MoeSparseMoeBlock(
          (gate): Linear(in_features=2048, out_features=128, bias=False)
          (experts): ModuleList(
            (0-127): 128 x Qwen3MoeMLP(
              (gate_proj): Linear(in_features=2048, out_features=768, bias=False)
              (up_proj): Linear(in_features=2048, out_features=768, bias=False)
              (down_proj): Linear(in_features=768, out_features=2048, bias=False)
              (act_fn): SiLU()
            )
          )
        )
        (input_layernorm): Qwen3MoeRMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): Qwen3MoeRMSNorm((2048,), eps=1e-06)
      )
    )
    (norm): Qwen3MoeRMSNorm((2048,), eps=1e-06)
    (rotary_emb): Qwen3MoeRotaryEmbedding()
  )
  (lm_head): Linear(in_features=2048, out_features=151936, bias=False)
)
```
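This printout can be reproduced by loading the checkpoint with Hugging Face Transformers and printing the module tree. A minimal sketch follows; the model ID `Qwen/Qwen3-30B-A3B` is an assumption inferred from the shapes above (48 layers, 128 experts, hidden size 2048), so substitute the checkpoint you are actually inspecting.

```python
# Minimal sketch: load a Qwen3 MoE checkpoint and print its module tree.
# The model ID is an assumption inferred from the shapes above; replace as needed.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-30B-A3B",  # assumed checkpoint: 48 layers, 128 experts, d_model=2048
    torch_dtype="auto",
    device_map="auto",
)
print(model)  # emits the Qwen3MoeForCausalLM structure shown above
```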
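The key component is `Qwen3MoeSparseMoeBlock`: the `gate` is a plain `Linear(2048 -> 128)` that scores all 128 experts for each token, after which only a small top-k subset is actually executed and their outputs are mixed with renormalized softmax weights. Each expert is a small SwiGLU MLP (`2048 -> 768 -> 2048`). Below is a minimal routing sketch under those assumptions (top_k is assumed to be 8, per this family's `num_experts_per_tok`); it mirrors the printed shapes for readability, not the library's batched implementation.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class ExpertMLP(nn.Module):
    """SwiGLU expert matching the printed shapes: 2048 -> 768 -> 2048."""
    def __init__(self, d_model=2048, d_ff=768):
        super().__init__()
        self.gate_proj = nn.Linear(d_model, d_ff, bias=False)
        self.up_proj = nn.Linear(d_model, d_ff, bias=False)
        self.down_proj = nn.Linear(d_ff, d_model, bias=False)

    def forward(self, x):
        return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))

def sparse_moe_forward(x, gate, experts, top_k=8):
    """Route each token to its top_k experts and mix their outputs.

    x:       (num_tokens, d_model) hidden states
    gate:    nn.Linear(d_model, num_experts) scoring all experts per token
    experts: iterable of num_experts ExpertMLP modules
    top_k:   assumed 8; check config.num_experts_per_tok for the real value
    """
    probs = F.softmax(gate(x), dim=-1)                     # (num_tokens, num_experts)
    weights, idx = torch.topk(probs, top_k, dim=-1)        # top_k experts per token
    weights = weights / weights.sum(dim=-1, keepdim=True)  # renormalize over the selected set

    out = torch.zeros_like(x)
    for slot in range(top_k):
        for e, expert in enumerate(experts):
            mask = idx[:, slot] == e                       # tokens whose slot-th pick is expert e
            if mask.any():
                out[mask] += weights[mask, slot].unsqueeze(-1) * expert(x[mask])
    return out

# Toy usage with scaled-down sizes (the real block uses d_model=2048 and 128 experts).
gate = nn.Linear(64, 16, bias=False)
experts = nn.ModuleList(ExpertMLP(d_model=64, d_ff=24) for _ in range(16))
y = sparse_moe_forward(torch.randn(4, 64), gate, experts, top_k=4)
```

Because only top_k of the 128 experts run per token, the active parameter count per forward pass is a small fraction of the total, which is the point of the sparse design.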