Table of contents

一、Recasting text similarity as text classification: 1. Import packages · 2. Load the dataset · 3. Split the dataset · 4. Preprocess the dataset · 5. Create the model · 6. Create the evaluation function · 7. Create the TrainingArguments · 8. Create the Trainer · 9. Train the model · 10. Evaluate the model · 11. Predict with the model
二、Interactive / single-tower (cross-encoder) mode: 1. Import packages · 2. Load the dataset · 3. Split the dataset · 4. Preprocess the dataset · 5. Create the model (differs here) · 6. Create the evaluation function (differs here) · 7. Create the TrainingArguments · 8. Create the Trainer · 9. Train the model · 10. Evaluate the model · 11. Predict with the model (differs here)

!pip install transformers datasets evaluate accelerate

一、Recasting text similarity as text classification

1. Import packages
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

2. Load the dataset
dataset = load_dataset("json", data_files="./train_pair_1w.json", split="train")
dataset
Dataset({
    features: ['sentence1', 'sentence2', 'label'],
    num_rows: 10000
})
dataset[:3]
{'sentence1': ['找一部小时候的动画片', '我不可能是一个有鉴赏能力的行家小姐我把我的时间都花在书写上象这样豪华的舞会我还是头一次见到。', '胡子长得太快怎么办'], 'sentence2': ['求一部小时候的动画片。谢了', '蜡烛没熄就好了夜黑得瘆人情绪压抑。', '胡子长得快怎么办'], 'label': ['1', '0', '1']}
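If train_pair_1w.json is not at hand, a stand-in Dataset with the same three fields is enough to follow the rest of the walkthrough. A minimal sketch (field names taken from the output above; the records are made up):

from datasets import Dataset

# Hypothetical stand-in mimicking the schema of train_pair_1w.json
dataset = Dataset.from_dict({
    "sentence1": ["找一部小时候的动画片", "胡子长得太快怎么办"],
    "sentence2": ["求一部小时候的动画片。谢了", "胡子长得快怎么办"],
    "label": ["1", "1"],  # labels are strings, hence the int()/float() casts later
})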
3. Split the dataset

datasets = dataset.train_test_split(test_size=0.2, seed=3407)
datasets["train"]["sentence1"][0]
'王琦瑶说:你家阿大二十岁已经有儿有女了嘛'

4. Preprocess the dataset
import torch

tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")

def process_function(examples):
    tokenized_examples = tokenizer(examples["sentence1"], examples["sentence2"], max_length=128, truncation=True)
    tokenized_examples["labels"] = [int(label) for label in examples["label"]]
    return tokenized_examples

tokenized_datasets = datasets.map(process_function, batched=True, remove_columns=datasets["train"].column_names)
tokenized_datasets
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})
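Since both sentences go into the tokenizer together, each example is one packed sequence, which is exactly what makes this a cross-encoder. A quick sanity check (assumes the cells above have run):

# Expect: [CLS] sentence1 [SEP] sentence2 [SEP]
print(tokenizer.decode(tokenized_datasets["train"][0]["input_ids"]))
# token_type_ids are 0 for sentence1 tokens and 1 for sentence2 tokens
print(tokenized_datasets["train"][0]["token_type_ids"])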
5. Create the model
model = AutoModelForSequenceClassification.from_pretrained("hfl/chinese-macbert-base", num_labels=2)
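num_labels=2 attaches a freshly initialized two-way classification head, and with two labels BertForSequenceClassification trains with cross-entropy loss. A quick shape check (a sketch, assuming the model and tokenizer above are loaded; the example pair is made up):

# One sentence pair in, one (1, 2) logits tensor out
inputs = tokenizer("我喜欢北京", "我爱北京", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(logits.shape)            # torch.Size([1, 2])
print(logits.softmax(dim=-1))  # class probabilities (head is untrained, so ~random)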
6. Create the evaluation function

import evaluate

acc_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def eval_metric(eval_predict):
    predictions, labels = eval_predict
    labels = [int(l) for l in labels]
    predictions = predictions.argmax(axis=-1)
    acc = acc_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels)
    acc.update(f1)
    return acc
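eval_metric can be sanity-checked without any training by feeding it fake logits and labels (Trainer passes numpy arrays, so numpy is assumed here):

import numpy as np

# 4 fake examples; argmax of the logits gives predictions [1, 0, 1, 0]
fake_logits = np.array([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7], [0.6, 0.4]])
fake_labels = np.array([1, 0, 0, 0])
print(eval_metric((fake_logits, fake_labels)))
# expected: accuracy 0.75, f1 ≈ 0.667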
7. Create the TrainingArguments

train_args = TrainingArguments(output_dir="./cross_model",      # output directory
                               per_device_train_batch_size=32,  # training batch size
                               per_device_eval_batch_size=32,   # evaluation batch size
                               logging_steps=10,                # logging frequency
                               evaluation_strategy="epoch",     # evaluate once per epoch
                               save_strategy="epoch",           # save once per epoch
                               save_total_limit=3,              # keep at most 3 checkpoints
                               learning_rate=2e-5,              # learning rate
                               weight_decay=0.01,               # weight decay
                               metric_for_best_model="f1",      # metric that defines "best"
                               load_best_model_at_end=True)     # reload the best checkpoint at the end

train_args
TrainingArguments(
_n_gpu=1,
do_eval=True,
evaluation_strategy=epoch,
greater_is_better=True,
learning_rate=2e-05,
load_best_model_at_end=True,
logging_steps=10,
logging_strategy=steps,
lr_scheduler_type=linear,
metric_for_best_model=f1,
num_train_epochs=3.0,
optim=adamw_torch,
output_dir=./cross_model,
per_device_eval_batch_size=32,
per_device_train_batch_size=32,
report_to=['tensorboard'],
save_strategy=epoch,
save_total_limit=3,
seed=42,
weight_decay=0.01,
... (the remaining fields keep their Transformers defaults)
)

8. Create the Trainer
from transformers import DataCollatorWithPadding

trainer = Trainer(model=model,
                  args=train_args,
                  train_dataset=tokenized_datasets["train"],
                  eval_dataset=tokenized_datasets["test"],
                  data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
                  compute_metrics=eval_metric)
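Because process_function tokenized without padding, examples have different lengths; DataCollatorWithPadding pads each batch on the fly to its longest member, which is cheaper than padding everything to max_length up front. A small illustration (a sketch, reusing the objects defined above):

collator = DataCollatorWithPadding(tokenizer=tokenizer)
batch = collator([tokenized_datasets["train"][i] for i in range(4)])
# All tensors are padded to the length of the longest of the 4 examples
print(batch["input_ids"].shape)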
9. Train the model

trainer.train()

10. Evaluate the model
trainer.evaluate(tokenized_datasets["test"])
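evaluate returns the metrics produced by eval_metric with an eval_ prefix, alongside loss and throughput numbers, so individual scores can be read out directly; for example:

metrics = trainer.evaluate(tokenized_datasets["test"])
print(metrics["eval_f1"], metrics["eval_accuracy"])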
11. Predict with the model

from transformers import pipeline

model.config.id2label = {0: "不相似", 1: "相似"}
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)

result = pipe({"text": "我喜欢北京", "text_pair": "天气"})
result
{'label': '不相似', 'score': 0.8792306780815125}

result = pipe({"text": "我喜欢北京", "text_pair": "我喜欢北京"})
result
{'label': '相似', 'score': 0.9374899864196777}

二、Interactive / single-tower (cross-encoder) mode
In the classification setup, label = 1 only says that the two sentences are similar, and the class is read off the predicted probabilities. But given one sentence A and several candidate sentences, picking the candidate most similar to A is not something that classification formulation handles well: its outputs are per-pair class probabilities, not a comparable similarity measure. Here we instead treat the task as regression with a decision threshold, so the model emits a single predicted score per pair, and that score can be used to decide which candidate is most similar to the given sentence (see the ranking sketch at the end of this part).
1. Import packages
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

2. Load the dataset
dataset = load_dataset("json", data_files="./train_pair_1w.json", split="train")
dataset
Dataset({
    features: ['sentence1', 'sentence2', 'label'],
    num_rows: 10000
})

dataset[:3]
{'sentence1': ['找一部小时候的动画片', '我不可能是一个有鉴赏能力的行家小姐我把我的时间都花在书写上象这样豪华的舞会我还是头一次见到。', '胡子长得太快怎么办'], 'sentence2': ['求一部小时候的动画片。谢了', '蜡烛没熄就好了夜黑得瘆人情绪压抑。', '胡子长得快怎么办'], 'label': ['1', '0', '1']}

3. Split the dataset
Note the seed argument, which keeps the split identical to the one in part 一.

datasets = dataset.train_test_split(test_size=0.2, seed=3407)
datasets["train"]["sentence1"][0]
'王琦瑶说:你家阿大二十岁已经有儿有女了嘛'

4. Preprocess the dataset
import torch

tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")

def process_function(examples):
    tokenized_examples = tokenizer(examples["sentence1"], examples["sentence2"], max_length=128, truncation=True)
    # float(label): the MSE (regression) loss needs float targets
    tokenized_examples["labels"] = [float(label) for label in examples["label"]]
    return tokenized_examples

tokenized_datasets = datasets.map(process_function, batched=True, remove_columns=datasets["train"].column_names)
tokenized_datasets
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})

print(tokenized_datasets["train"][0])
{'input_ids': [101, 4374, 4425, 4457, 6432, 131, 872, 2157, 7350, 1920, 753, 1282, 2259, 2347, 5307, 3300, 1036, 3300, 1957, 749, 1658, 102, 7350, 1920, 1372, 3300, 1036, 2094, 3766, 3300, 1957, 1036, 102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': 0.0}

5. Create the model (differs here)
model = AutoModelForSequenceClassification.from_pretrained("hfl/chinese-macbert-base", num_labels=1)
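With num_labels=1, BertForSequenceClassification infers problem_type="regression" and trains with MSELoss between the single logit and the float label, which is why the labels were cast to float above. A quick shape check (a sketch, assuming the model and tokenizer above are loaded; the example pair is made up):

# A single output neuron: the model emits one raw similarity score per pair
inputs = tokenizer("我喜欢北京", "我爱北京", return_tensors="pt")
with torch.no_grad():
    score = model(**inputs).logits
print(score.shape)  # torch.Size([1, 1]) -- no softmax, just a raw regression score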
6. Create the evaluation function (differs here)

import evaluate

acc_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def eval_metric(eval_predict):
    predictions, labels = eval_predict
    # threshold the regression scores at 0.5 to get hard 0/1 predictions
    predictions = [int(p > 0.5) for p in predictions]
    labels = [int(l) for l in labels]
    # predictions = predictions.argmax(axis=-1)  # (the classification version)
    acc = acc_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels)
    acc.update(f1)
    return acc

7. Create the TrainingArguments
train_args = TrainingArguments(output_dir="./cross_model",      # output directory
                               per_device_train_batch_size=32,  # training batch size
                               per_device_eval_batch_size=32,   # evaluation batch size
                               logging_steps=10,                # logging frequency
                               evaluation_strategy="epoch",     # evaluate once per epoch
                               save_strategy="epoch",           # save once per epoch
                               save_total_limit=3,              # keep at most 3 checkpoints
                               learning_rate=2e-5,              # learning rate
                               weight_decay=0.01,               # weight decay
                               metric_for_best_model="f1",      # metric that defines "best"
                               load_best_model_at_end=True)     # reload the best checkpoint at the end

train_args
(The printed TrainingArguments are the same as in part 一 above, apart from the logging_dir timestamp.)

8. Create the Trainer
from transformers import DataCollatorWithPadding

trainer = Trainer(model=model,
                  args=train_args,
                  train_dataset=tokenized_datasets["train"],
                  eval_dataset=tokenized_datasets["test"],
                  data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
                  compute_metrics=eval_metric)

9. Train the model
trainer.train()
TrainOutput(global_step=750, training_loss=0.09012470634778341, metrics={'train_runtime': 558.2367, 'train_samples_per_second': 42.993, 'train_steps_per_second': 1.344, 'total_flos': 1552456398705984.0, 'train_loss': 0.09012470634778341, 'epoch': 3.0})
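The reported global_step is consistent with the data and batch sizes (num_train_epochs defaults to 3.0, as the TrainingArguments dump above shows):

# 8000 training examples, per-device batch size 32, 3 epochs
steps_per_epoch = 8000 // 32  # 250
print(steps_per_epoch * 3)    # 750 == global_step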
10. Evaluate the model

trainer.evaluate(tokenized_datasets["test"])
{'eval_loss': 0.06814368069171906,
 'eval_accuracy': 0.9095,
 'eval_f1': 0.8840486867392696,
 'eval_runtime': 14.6336,
 'eval_samples_per_second': 136.672,
 'eval_steps_per_second': 4.305,
 'epoch': 3.0}

11. Predict with the model (differs here)
from transformers import pipeline

model.config.id2label = {0: "不相似", 1: "相似"}
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)

# function_to_apply="none" turns off the pipeline's softmax/sigmoid post-processing,
# so the raw regression score comes back unchanged
result = pipe({"text": "我喜欢北京", "text_pair": "天气怎样"}, function_to_apply="none")
result["label"] = "相似" if result["score"] > 0.5 else "不相似"
result
{'label': '不相似', 'score': -0.025799434632062912}
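This raw score is what makes the regression setup useful for the candidate-ranking problem described at the start of this part: score every (query, candidate) pair and take the argmax. A minimal sketch, reusing pipe from above (the candidate strings are made up):

query = "我喜欢北京"
candidates = ["天气怎样", "我喜欢北京这座城市", "胡子长得太快怎么办"]

# Score each pair with the raw (pre-sigmoid) regression output
scored = [(c, pipe({"text": query, "text_pair": c}, function_to_apply="none")["score"])
          for c in candidates]
best = max(scored, key=lambda pair: pair[1])
print(scored)
print("most similar candidate:", best[0])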