import torch
from pathlib import Path

from transformers import (
    GPT2Config,
    GPT2LMHeadModel,
    GPT2TokenizerFast,
    Trainer,
    TrainingArguments,
    TextDataset,
    DataCollatorForLanguageModeling,
)

# === Parameters ===
model_name = "NekitAI"
data_path = "my_texts.txt"
block_size = 128
batch_size = 4
epochs = 3

# === Tokenizer ===
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # required so the collator can pad batches

# === Model configuration ===
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    n_positions=block_size,
    n_embd=256,
    n_layer=4,
    n_head=4,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

# === Model creation ===
model = GPT2LMHeadModel(config)

# === Dataset preparation ===
# Note: TextDataset is deprecated in recent transformers releases (the `datasets`
# library is the recommended replacement), but it still works for this simple case.
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=data_path,
    block_size=block_size,
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # causal LM objective, not masked LM
)

# === Training arguments ===
training_args = TrainingArguments(
    output_dir=model_name,
    overwrite_output_dir=True,
    per_device_train_batch_size=batch_size,
    num_train_epochs=epochs,
    save_steps=500,
    logging_steps=50,
    save_total_limit=1,
    prediction_loss_only=True,
    fp16=torch.cuda.is_available(),  # enable mixed precision only when a CUDA GPU is present
)

# === Trainer ===
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# === Training ===
trainer.train()

# === Save model and tokenizer ===
Path(model_name).mkdir(parents=True, exist_ok=True)
model.save_pretrained(model_name)
tokenizer.save_pretrained(model_name)

print(f"\n✅ Done! Model saved to: {model_name}")
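
# --- Usage sketch (added example, not part of the original script) ---
# Assuming training above completed and the model/tokenizer were saved to the
# `model_name` directory ("NekitAI"), this loads them back and samples a short
# continuation. The prompt string is just a placeholder.
loaded_tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
loaded_model = GPT2LMHeadModel.from_pretrained(model_name)
loaded_model.eval()

prompt = "Hello"  # placeholder; replace with text in the language of my_texts.txt
inputs = loaded_tokenizer(prompt, return_tensors="pt")
with torch.no_grad():
    output_ids = loaded_model.generate(
        **inputs,
        max_new_tokens=50,
        do_sample=True,
        top_p=0.95,
        pad_token_id=loaded_tokenizer.eos_token_id,
    )
print(loaded_tokenizer.decode(output_ids[0], skip_special_tokens=True))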