
[NLP Project] Saving and Loading a Fine-Tuned Model in TensorFlow

sillon 2022. 11. 15. 00:53

In this post, I'll save a BERT model fine-tuned in TensorFlow and then load it back.

 

model.fit(
    X_train, y_train, epochs=1, batch_size=128,
    callbacks=[f1_score_report]
)

model.save_weights('save_model/model_weight')

The trained model's weights are saved with save_weights.
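A quick note: save_weights stores only the weight values (TensorFlow checkpoint format), not the architecture, so before load_weights the exact same model has to be rebuilt in code. Here is a minimal sketch of that round trip, using a made-up toy model purely for illustration:

import tensorflow as tf

# Toy model purely for illustration; the actual model in this post is the BERT model below.
def build_model():
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(4, activation='relu', input_shape=(8,)),
        tf.keras.layers.Dense(2),
    ])
    model.compile(optimizer='adam', loss='mse')
    return model

model = build_model()
model.save_weights('save_model/model_weight')      # writes model_weight.index / model_weight.data-* files

restored = build_model()                           # rebuild the same architecture first
restored.load_weights('save_model/model_weight')   # then restore the saved weights into it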

 

predict.py

def model_load():
    model = modeling(model_name='bert-base-multilingual-cased',
                                  tag_size=30)
    model.load_weights('save_model/model_weight')

    print("model_load성공!!")
    return model

We rebuild the full model architecture and compile it, then load the saved weights into it.
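One thing to keep in mind: TFBertForTokenClassification (defined below) is a subclassed model, so the classifier layer's variables are only created on the first forward pass; with the TensorFlow checkpoint format, load_weights simply defers restoring those values until then. If you want the restore to happen right away (for example, to catch a wrong checkpoint early), one option is to run the model once on dummy inputs before loading. A rough sketch, assuming the modeling() function and max_seq_len=88 from this post:

import numpy as np

# Sketch only: run one dummy forward pass so every variable exists,
# then load_weights restores immediately instead of lazily.
model = modeling(model_name='bert-base-multilingual-cased', tag_size=30)
dummy = (np.zeros((1, 88), dtype=int),   # input_ids
         np.zeros((1, 88), dtype=int),   # attention_mask
         np.zeros((1, 88), dtype=int))   # token_type_ids
model(dummy)                             # creates the classifier variables
model.load_weights('save_model/model_weight')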

 

predict.py

from konlpy.tag import Mecab
import glob, re
import numpy as np
import codecs
import os
import json
from tqdm import tqdm
import tensorflow as tf
from seqeval.metrics import f1_score, classification_report
from transformers import shape_list, BertTokenizer, TFBertModel
from tensorflow import keras
from transformers import *

from silence_tensorflow import silence_tensorflow

silence_tensorflow()


mecab = Mecab()
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

label_dict = {'PER_B': 0, 'DAT_B': 1, '-': 2, 'ORG_B': 3, 'CVL_B': 4, 'NUM_B': 5, 'LOC_B': 6, 'EVT_B': 7, 'TRM_B': 8, 'TRM_I': 9, 'EVT_I': 10, 'PER_I': 11, 'CVL_I': 12, 'NUM_I': 13, 'TIM_B': 14, 'TIM_I': 15, 'ORG_I': 16, 'DAT_I': 17, 'ANM_B': 18, 'MAT_B': 19, 'MAT_I': 20, 'AFW_B': 21, 'FLD_B': 22, 'LOC_I': 23, 'AFW_I': 24, 'PLT_B': 25, 'FLD_I': 26, 'ANM_I': 27, 'PLT_I': 28, '[PAD]': 29}
index_to_tag = {v:k for k,v in label_dict.items()} # invert the dict: look up a tag (value) by its index (key)



class TFBertForTokenClassification(tf.keras.Model):
    def __init__(self, model_name, num_labels):
        super(TFBertForTokenClassification, self).__init__()
        # from_pt=True converts the pretrained PyTorch checkpoint to TensorFlow weights
        self.bert = TFBertModel.from_pretrained(model_name, from_pt=True)
        # token-level classifier: one logit per NER tag for every token position
        self.classifier = tf.keras.layers.Dense(num_labels,
                                                kernel_initializer=tf.keras.initializers.TruncatedNormal(0.02),
                                                name='classifier')

    def call(self, inputs):
        input_ids, attention_mask, token_type_ids = inputs
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        all_output = outputs[0]  # last hidden state: (batch_size, seq_len, hidden_size)
        prediction = self.classifier(all_output)

        return prediction


def modeling(model_name, tag_size):
    model = TFBertForTokenClassification(model_name, tag_size)
    optimizer = tf.keras.optimizers.RMSprop(learning_rate=5e-5)
    model.compile(optimizer=optimizer, loss=compute_loss)
    return model

def compute_loss(labels, logits):
    # Positions labeled -100 (padding and non-first subwords) are excluded from the loss.
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
    active_loss = tf.reshape(labels, (-1,)) != -100
    reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss)
    labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss)

    return loss_fn(labels, reduced_logits)

def convert_examples_to_features_for_prediction(examples, max_seq_len, tokenizer,
                                 pad_token_id_for_segment=0, pad_token_id_for_label=-100):
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    pad_token_id = tokenizer.pad_token_id

    input_ids, attention_masks, token_type_ids, label_masks = [], [], [], []

    for example in tqdm(examples):
        tokens = []
        label_mask = []
        for one_word in example:
            subword_tokens = tokenizer.tokenize(one_word)
            tokens.extend(subword_tokens)
            # only the first subword of each word keeps a label position (0); the rest are masked with -100
            label_mask.extend([0] + [pad_token_id_for_label] * (len(subword_tokens) - 1))

        special_tokens_count = 2
        if len(tokens) > max_seq_len - special_tokens_count:
            tokens = tokens[:(max_seq_len - special_tokens_count)]
            label_mask = label_mask[:(max_seq_len - special_tokens_count)]

        tokens += [sep_token]
        label_mask += [pad_token_id_for_label]

        tokens = [cls_token] + tokens
        label_mask = [pad_token_id_for_label] + label_mask


        input_id = tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_id)
        padding_count = max_seq_len - len(input_id)
        input_id = input_id + ([pad_token_id] * padding_count)
        attention_mask = attention_mask + ([0] * padding_count)
        token_type_id = [pad_token_id_for_segment] * max_seq_len
        label_mask = label_mask + ([pad_token_id_for_label] * padding_count)

        assert len(input_id) == max_seq_len, "Error with input length {} vs {}".format(len(input_id), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(token_type_id) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_id), max_seq_len)
        assert len(label_mask) == max_seq_len, "Error with labels length {} vs {}".format(len(label_mask), max_seq_len)

        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        label_masks.append(label_mask)

    input_ids = np.array(input_ids, dtype=int)
    attention_masks = np.array(attention_masks, dtype=int)
    token_type_ids = np.array(token_type_ids, dtype=int)
    label_masks = np.asarray(label_masks, dtype=np.int32)

    return (input_ids, attention_masks, token_type_ids), label_masks

    
def ner_prediction(examples, max_seq_len, tokenizer, lang='ko'):
  
  if lang == 'ko':
    examples = [mecab.morphs(sent) for sent in examples]
  else:
    examples = [sent.split() for sent in examples]

  X_pred, label_masks = convert_examples_to_features_for_prediction(examples, max_seq_len=max_seq_len, tokenizer=tokenizer)
  model = model_load()
  y_predicted = model.predict(X_pred)
  y_predicted = np.argmax(y_predicted, axis = 2)

  pred_list = []
  result_list = []

  for i in range(0, len(label_masks)):
    pred_tag = []
    for label_index, pred_index in zip(label_masks[i], y_predicted[i]):
      if label_index != -100:  # keep predictions only for the first subword of each word
        pred_tag.append(index_to_tag[pred_index])

    pred_list.append(pred_tag)

  for example, pred in zip(examples, pred_list):
    one_sample_result = []
    for one_word, label_token in zip(example, pred):
      one_sample_result.append((one_word, label_token))
    result_list.append(one_sample_result)

  return result_list

def model_load():
    model = modeling(model_name='bert-base-multilingual-cased',
                                  tag_size=30)
    model.load_weights('save_model/model_weight')

    print("model_load성공!!")
    return model

if __name__ == "__main__":
    sent1 = '오리온스는 리그 최정상급 포인트가드 김동훈을 앞세우는 빠른 공수전환이 돋보이는 팀이다'
    sent2 = '이 다음에는 koBert를 fineturning할거야!'
    sent3 = '강수연의 생일은 12월 6일이다.'

    test_samples = [sent1, sent2, sent3]
    
    result_list = ner_prediction(test_samples, max_seq_len=88, tokenizer=tokenizer, lang='ko')
    print(result_list)

Just to check that the model loads properly, I loaded a checkpoint that was trained on only 500 sentences as a quick test.

 

model_load성공!!
2022-11-15 00:52:03.527352: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2022-11-15 00:52:03.544508: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 3699850000 Hz
[[('오리온스', '-'), ('는', '[PAD]'), ('리그', '[PAD]'), ('최', '[PAD]'), ('정상급', '-'), ('포인트', '-'), ('가드', '-'), ('김동훈', '[PAD]'), ('을', '-'), ('앞세우', '-'), ('는', '[PAD]'), ('빠른', '-'), ('공수', '-'), ('전환', '-'), ('이', '[PAD]'), ('돋보이', '-'), ('는', '[PAD]'), ('팀', '-'), ('이', '[PAD]'), ('다', '[PAD]')], [('이', '[PAD]'), ('다음', '-'), ('에', '[PAD]'), ('는', '[PAD]'), ('koBert', '[PAD]'), ('를', '[PAD]'), ('fineturning', '[PAD]'), ('할', '-'), ('거', '-'), ('야', '[PAD]'), ('!', '[PAD]')], [('강수연', '[PAD]'), ('의', '-'), ('생일', '-'), ('은', '[PAD]'), ('12', '[PAD]'), ('월', '-'), ('6', '[PAD]'), ('일', '[PAD]'), ('이', '[PAD]'), ('다', '[PAD]'), ('.', '-')]]

Everything worked successfully!

 

(I hadn't been saving the weights this way until now, so I'll have to retrain and save the model all over again... ㅠㅠ)
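For the next run, one way to avoid this is to let Keras save the weights automatically during training with a ModelCheckpoint callback set to save_weights_only=True. A rough sketch, reusing the fit call from above (the filepath is just an example):

import tensorflow as tf

# Save the weights at the end of every epoch so the run isn't lost
# even if save_weights is forgotten after training.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath='save_model/model_weight',
    save_weights_only=True,
)

model.fit(
    X_train, y_train, epochs=1, batch_size=128,
    callbacks=[f1_score_report, checkpoint_cb]
)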
