import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM
from transformers import PreTrainedModel, PretrainedConfig


class PatentClassifierConfig(PretrainedConfig):
    model_type = "patent_classifier"

    def __init__(self,
                 model_name="Qwen/Qwen3-0.6B",
                 hidden_dims=[512, 256],
                 output_dim=9,
                 dropout_rate=0.1,
                 max_length=256,
                 **kwargs):
        super().__init__(**kwargs)
        self.model_name = model_name
        self.hidden_dims = hidden_dims
        self.output_dim = output_dim
        self.dropout_rate = dropout_rate
        self.max_length = max_length


class PatentClassifier(PreTrainedModel):
    config_class = PatentClassifierConfig

    def __init__(self, config):
        super().__init__(config)
        self.config = config

        # Qwen checkpoints are loaded as causal LMs; other backbones as plain encoders.
        if "qwen" in config.model_name.lower():
            self.base_llm_model = AutoModelForCausalLM.from_pretrained(
                config.model_name,
                trust_remote_code=True
            )
        else:
            self.base_llm_model = AutoModel.from_pretrained(config.model_name)

        # Freeze the backbone so only the classification head is trained.
        for param in self.base_llm_model.parameters():
            param.requires_grad = False

        self.hidden_size = self.base_llm_model.config.hidden_size

        # Build the MLP head: [Linear -> ReLU -> Dropout] per hidden dim, then a final projection.
        layers = []
        input_dim = self.hidden_size
        for dim in config.hidden_dims:
            layers.append(nn.Linear(input_dim, dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(config.dropout_rate))
            input_dim = dim
        layers.append(nn.Linear(input_dim, config.output_dim))
        self.classifier = nn.Sequential(*layers)

        self.tokenizer = AutoTokenizer.from_pretrained(config.model_name)

    def forward(self, input_ids, attention_mask):
        # The frozen backbone runs without gradient tracking.
        with torch.no_grad():
            outputs = self.base_llm_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                output_hidden_states=True
            )
        last_hidden_state = outputs.hidden_states[-1]
        # Mean-pool the last hidden states over non-padding tokens.
        mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
        weighted_hidden = last_hidden_state * mask
        cls_embedding = weighted_hidden.sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
        return self.classifier(cls_embedding)

    def tokenize(self, texts, max_length=None):
        max_length = max_length or self.config.max_length
        return self.tokenizer(
            texts,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
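

# --- Minimal usage sketch (illustrative only) ---
# Assumes the Qwen/Qwen3-0.6B checkpoint is reachable (locally or via the Hub);
# the example texts are placeholders, and the 9-way label space comes from the
# default PatentClassifierConfig above.
if __name__ == "__main__":
    config = PatentClassifierConfig()
    model = PatentClassifier(config)
    model.eval()

    texts = [
        "A method for encoding video streams using adaptive block partitioning.",
        "A pharmaceutical composition comprising a monoclonal antibody.",
    ]
    batch = model.tokenize(texts)

    with torch.no_grad():
        logits = model(batch["input_ids"], batch["attention_mask"])

    predictions = logits.argmax(dim=-1)
    print(predictions)  # predicted class indices, shape (2,)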