{ "architectures": [ "PerceptionLMForConditionalGeneration" ], "image_token_id": 128002, "model_type": "perception_lm", "projector_pooling_ratio": 2, "text_config": { "attention_bias": false, "attention_dropout": 0.0, "bos_token_id": 128000, "eos_token_id": [ 128001, 128009 ], "head_dim": 128, "hidden_act": "silu", "hidden_size": 3072, "initializer_range": 0.02, "intermediate_size": 8192, "max_position_embeddings": 11520, "mlp_bias": false, "model_type": "llama", "num_attention_heads": 24, "num_hidden_layers": 28, "num_key_value_heads": 8, "pretraining_tp": 1, "rms_norm_eps": 1e-05, "rope_scaling": { "factor": 32.0, "high_freq_factor": 4.0, "low_freq_factor": 1.0, "original_max_position_embeddings": 8192, "rope_type": "llama3" }, "rope_theta": 500000.0, "tie_word_embeddings": true, "torch_dtype": "bfloat16", "use_cache": true, "vocab_size": 128256 }, "torch_dtype": "bfloat16", "transformers_version": "4.54.0.dev0", "video_token_id": 128003, "vision_config": { "architecture": "vit_pe_core_large_patch14_336", "do_pooling": true, "global_pool": "map", "initializer_range": 0.02, "label_names": [ "LABEL_0", "LABEL_1" ], "model_args": { "depth": 23, "embed_dim": 1024, "global_pool": "", "img_size": [ 448, 448 ], "init_values": 0.1, "ref_feat_shape": [ 32, 32 ], "use_post_transformer_norm": false }, "model_type": "timm_wrapper", "num_classes": 2, "num_features": 1024, "pretrained_cfg": { "classifier": "head", "crop_mode": "center", "crop_pct": 1.0, "custom_load": false, "first_conv": "patch_embed.proj", "fixed_input_size": true, "input_size": [ 3, 336, 336 ], "interpolation": "bicubic", "license": "custom", "mean": [ 0.5, 0.5, 0.5 ], "pool_size": null, "std": [ 0.5, 0.5, 0.5 ], "tag": "fb" }, "torch_dtype": "bfloat16" }, "vision_use_cls_token": true }