In this post, I'll save and reload the BERT model fine-tuned in TensorFlow.
model.fit(
    X_train, y_train, epochs=1, batch_size=128,
    callbacks=[f1_score_report]
)
model.save_weights('save_model/model_weight')
The trained model is saved with save_weights.
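Because the path has no .h5 extension, save_weights writes a TensorFlow checkpoint (a checkpoint file plus .index/.data shards) rather than a single HDF5 file, which is why the same architecture has to be rebuilt before the weights can be restored. A minimal sketch for checking that the checkpoint was actually written (assuming the save_model directory from the code above):

import tensorflow as tf

# save_weights('save_model/model_weight') used the TF checkpoint format,
# so save_model/ should contain 'checkpoint', '*.index' and '*.data-*' files.
latest = tf.train.latest_checkpoint('save_model')
print(latest)  # expected: save_model/model_weight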
predict.py
def model_load():
    model = modeling(model_name='bert-base-multilingual-cased',
                     tag_size=30)
    model.load_weights('save_model/model_weight')
    print("model_load성공!!")
    return model
The full model architecture is rebuilt and compiled first, and then the saved weights are loaded into it.
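Since a TF-format checkpoint can also contain values that are not needed for inference (e.g. optimizer slots), TensorFlow may print warnings about unrestored objects when the weights are loaded; if that happens, the load status can be silenced like this (a small optional tweak, not part of the original code):

model.load_weights('save_model/model_weight').expect_partial()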
predict.py
from konlpy.tag import Mecab
import glob, re
import numpy as np
import codecs
import os
import json
from tqdm import tqdm
import tensorflow as tf
from seqeval.metrics import f1_score, classification_report
from transformers import shape_list, BertTokenizer, TFBertModel
from tensorflow import keras
from transformers import *
from silence_tensorflow import silence_tensorflow
silence_tensorflow()
mecab = Mecab()
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
label_dict = {'PER_B': 0, 'DAT_B': 1, '-': 2, 'ORG_B': 3, 'CVL_B': 4, 'NUM_B': 5,
              'LOC_B': 6, 'EVT_B': 7, 'TRM_B': 8, 'TRM_I': 9, 'EVT_I': 10, 'PER_I': 11,
              'CVL_I': 12, 'NUM_I': 13, 'TIM_B': 14, 'TIM_I': 15, 'ORG_I': 16, 'DAT_I': 17,
              'ANM_B': 18, 'MAT_B': 19, 'MAT_I': 20, 'AFW_B': 21, 'FLD_B': 22, 'LOC_I': 23,
              'AFW_I': 24, 'PLT_B': 25, 'FLD_I': 26, 'ANM_I': 27, 'PLT_I': 28, '[PAD]': 29}
index_to_tag = {v: k for k, v in label_dict.items()}  # invert the key-value pairs so a tag (value) can be looked up by its index (key)
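# e.g. index_to_tag[0] == 'PER_B' and index_to_tag[29] == '[PAD]';
# the argmax indices returned by the model below are mapped back to
# tag strings through this dictionary.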
class TFBertForTokenClassification(tf.keras.Model):
    def __init__(self, model_name, num_labels):
        super(TFBertForTokenClassification, self).__init__()
        self.bert = TFBertModel.from_pretrained(model_name, from_pt=True)
        self.classifier = tf.keras.layers.Dense(num_labels,
                                                kernel_initializer=tf.keras.initializers.TruncatedNormal(0.02),
                                                name='classifier')

    def call(self, inputs):
        input_ids, attention_mask, token_type_ids = inputs
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        all_output = outputs[0]
        prediction = self.classifier(all_output)
        return prediction
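# Shape note: outputs[0] is BERT's last hidden state of shape
# (batch_size, seq_len, hidden_size); the Dense layer maps every token
# vector to num_labels logits, so `prediction` is (batch_size, seq_len, num_labels).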
def modeling(model_name, tag_size):
    model = TFBertForTokenClassification(model_name, tag_size)
    optimizer = tf.keras.optimizers.RMSprop(learning_rate=5e-5)
    model.compile(optimizer=optimizer, loss=compute_loss)
    return model
def compute_loss(labels, logits):
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
    active_loss = tf.reshape(labels, (-1,)) != -100
    reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss)
    labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss)
    return loss_fn(labels, reduced_logits)
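# Worked example of the masking above (illustrative values):
#   labels flattened -> [ 0, -100,  5, -100]
#   active_loss      -> [True, False, True, False]
# Only positions 0 and 2 contribute to the loss, so [CLS]/[SEP]/padding and
# non-first sub-word tokens (all labelled -100) are ignored.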
def convert_examples_to_features_for_prediction(examples, max_seq_len, tokenizer,
                                                pad_token_id_for_segment=0, pad_token_id_for_label=-100):
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    pad_token_id = tokenizer.pad_token_id
    input_ids, attention_masks, token_type_ids, label_masks = [], [], [], []

    for example in tqdm(examples):
        tokens = []
        label_mask = []
        for one_word in example:
            subword_tokens = tokenizer.tokenize(one_word)
            tokens.extend(subword_tokens)
            label_mask.extend([0] + [pad_token_id_for_label] * (len(subword_tokens) - 1))

        special_tokens_count = 2
        if len(tokens) > max_seq_len - special_tokens_count:
            tokens = tokens[:(max_seq_len - special_tokens_count)]
            label_mask = label_mask[:(max_seq_len - special_tokens_count)]

        tokens += [sep_token]
        label_mask += [pad_token_id_for_label]
        tokens = [cls_token] + tokens
        label_mask = [pad_token_id_for_label] + label_mask

        input_id = tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_id)
        padding_count = max_seq_len - len(input_id)
        input_id = input_id + ([pad_token_id] * padding_count)
        attention_mask = attention_mask + ([0] * padding_count)
        token_type_id = [pad_token_id_for_segment] * max_seq_len
        label_mask = label_mask + ([pad_token_id_for_label] * padding_count)

        assert len(input_id) == max_seq_len, "Error with input length {} vs {}".format(len(input_id), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(token_type_id) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_id), max_seq_len)
        assert len(label_mask) == max_seq_len, "Error with labels length {} vs {}".format(len(label_mask), max_seq_len)

        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        label_masks.append(label_mask)

    input_ids = np.array(input_ids, dtype=int)
    attention_masks = np.array(attention_masks, dtype=int)
    token_type_ids = np.array(token_type_ids, dtype=int)
    label_masks = np.asarray(label_masks, dtype=np.int32)

    return (input_ids, attention_masks, token_type_ids), label_masks
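# Example of what the conversion produces for one (hypothetical) word list
# ['강수연', '의', '생일'] with max_seq_len=8:
#   tokens     -> [CLS] + sub-words + [SEP], padded with [PAD] to length 8
#   label_mask -> -100 for [CLS], [SEP], padding and non-first sub-words,
#                 0 for the first sub-word of each original word,
# which lets ner_prediction below keep exactly one prediction per input word.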
def ner_prediction(examples, max_seq_len, tokenizer, lang='ko'):
    if lang == 'ko':
        examples = [mecab.morphs(sent) for sent in examples]
    else:
        examples = [sent.split() for sent in examples]

    X_pred, label_masks = convert_examples_to_features_for_prediction(examples, max_seq_len=max_seq_len, tokenizer=tokenizer)
    model = model_load()
    y_predicted = model.predict(X_pred)
    y_predicted = np.argmax(y_predicted, axis=2)

    pred_list = []
    result_list = []

    for i in range(0, len(label_masks)):
        pred_tag = []
        for label_index, pred_index in zip(label_masks[i], y_predicted[i]):
            if label_index != -100:
                pred_tag.append(index_to_tag[pred_index])
        pred_list.append(pred_tag)

    for example, pred in zip(examples, pred_list):
        one_sample_result = []
        for one_word, label_token in zip(example, pred):
            one_sample_result.append((one_word, label_token))
        result_list.append(one_sample_result)

    return result_list
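# result_list holds one list of (word, tag) pairs per input sentence, e.g.
# [[('강수연', 'PER_B'), ('의', '-'), ...], ...] (tags here are illustrative;
# the actual tags depend on the loaded weights).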
def model_load():
    model = modeling(model_name='bert-base-multilingual-cased',
                     tag_size=30)
    model.load_weights('save_model/model_weight')
    print("model_load성공!!")
    return model
if __name__ == "__main__":
    sent1 = '오리온스는 리그 최정상급 포인트가드 김동훈을 앞세우는 빠른 공수전환이 돋보이는 팀이다'
    sent2 = '이 다음에는 koBert를 fineturning할거야!'
    sent3 = '강수연의 생일은 12월 6일이다.'

    test_samples = [sent1, sent2, sent3]
    result_list = ner_prediction(test_samples, max_seq_len=88, tokenizer=tokenizer, lang='ko')
    print(result_list)
To check that the model loads correctly, I used a checkpoint that was trained on only 500 sentences as a quick test.
model_load성공!!
2022-11-15 00:52:03.527352: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2022-11-15 00:52:03.544508: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 3699850000 Hz
[[('오리온스', '-'), ('는', '[PAD]'), ('리그', '[PAD]'), ('최', '[PAD]'), ('정상급', '-'), ('포인트', '-'), ('가드', '-'), ('김동훈', '[PAD]'), ('을', '-'), ('앞세우', '-'), ('는', '[PAD]'), ('빠른', '-'), ('공수', '-'), ('전환', '-'), ('이', '[PAD]'), ('돋보이', '-'), ('는', '[PAD]'), ('팀', '-'), ('이', '[PAD]'), ('다', '[PAD]')], [('이', '[PAD]'), ('다음', '-'), ('에', '[PAD]'), ('는', '[PAD]'), ('koBert', '[PAD]'), ('를', '[PAD]'), ('fineturning', '[PAD]'), ('할', '-'), ('거', '-'), ('야', '[PAD]'), ('!', '[PAD]')], [('강수연', '[PAD]'), ('의', '-'), ('생일', '-'), ('은', '[PAD]'), ('12', '[PAD]'), ('월', '-'), ('6', '[PAD]'), ('일', '[PAD]'), ('이', '[PAD]'), ('다', '[PAD]'), ('.', '-')]]
It ran successfully!
(I hadn't been saving the weights this way until now, so I'll have to retrain and save the model again... ㅠㅠ)