728x90
학습을 위해 데이터를 학습(train)용과 검증(validation)용으로 분리해 주겠습니다.
from sklearn.model_selection import train_test_split
# Hold out 10% of the training data as validation data.
# Splitting inputs, tags and masks in a single call applies the same shuffled
# indices to all three arrays, so the rows stay aligned without relying on a
# second split that repeats the same random_state / test_size.
(tr_inputs, val_inputs,
 tr_tags, val_tags,
 tr_masks, val_masks) = train_test_split(
    input_ids, tags, attention_masks, random_state=222, test_size=0.1)
main.py
import pickle
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
def file_open(filePath):
    """Load and return the object pickled in the file at ``filePath``.

    Args:
        filePath: Path of the pickle file to read; it is opened in binary mode.

    Returns:
        The object reconstructed by ``pickle.load``.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the content is not a valid pickle
            (``pickle.load`` may raise other exceptions as well).
    """
    # NOTE(security): unpickling can execute arbitrary code, so only call this
    # on files that come from a trusted source.
    # The context manager closes the file even when unpickling raises.
    with open(filePath, "rb") as f:
        return pickle.load(f)
if __name__ == "__main__":
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

    # Each entry of the pickled data is indexed as [0] -> tokens, [1] -> labels.
    tokened_data = file_open('data/token_data.pkl')
    tokenized_texts = [token_label_pair[0] for token_label_pair in tokened_data]
    labels = [token_label_pair[1] for token_label_pair in tokened_data]

    ## padding
    # max_len is the 97.5% quantile of the sentence lengths (88), obtained with:
    # print(np.quantile(np.array([len(x) for x in tokenized_texts]), 0.975))
    max_len = 88
    bs = 32

    # Look the [PAD] id up once instead of once per token further below.
    pad_token_id = tokenizer.convert_tokens_to_ids("[PAD]")

    # Convert tokens to ids, then pad / truncate every sentence at the end
    # ("post") so that all rows have exactly max_len entries.
    input_ids = pad_sequences(
        [tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
        maxlen=max_len, dtype="int", value=pad_token_id,
        truncating="post", padding="post")

    label_dict = {
        'PER_B': 0, 'DAT_B': 1, '-': 2, 'ORG_B': 3, 'CVL_B': 4, 'NUM_B': 5,
        'LOC_B': 6, 'EVT_B': 7, 'TRM_B': 8, 'TRM_I': 9, 'EVT_I': 10,
        'PER_I': 11, 'CVL_I': 12, 'NUM_I': 13, 'TIM_B': 14, 'TIM_I': 15,
        'ORG_I': 16, 'DAT_I': 17, 'ANM_B': 18, 'MAT_B': 19, 'MAT_I': 20,
        'AFW_B': 21, 'FLD_B': 22, 'LOC_I': 23, 'AFW_I': 24, 'PLT_B': 25,
        'FLD_I': 26, 'ANM_I': 27, 'PLT_I': 28, '[PAD]': 29,
    }

    # Pad the label sequences the same way, filling with the [PAD] label id.
    tags = pad_sequences(labels, maxlen=max_len, value=label_dict["[PAD]"],
                         padding='post', dtype='int', truncating='post')

    # Attention mask: 1 where the id is not the [PAD] id, 0 where it is.
    # A single vectorized comparison replaces the per-token Python loop.
    attention_masks = (input_ids != pad_token_id).astype(int)

    # Hold out 10% of the training data as validation data.
    # Splitting all three arrays in one call applies the same shuffled indices
    # to each of them, so inputs, tags and masks stay row-aligned.
    (tr_inputs, val_inputs,
     tr_tags, val_tags,
     tr_masks, val_masks) = train_test_split(
        input_ids, tags, attention_masks, random_state=222, test_size=0.1)
728x90
'Project > 캡스톤디자인2' 카테고리의 다른 글
[NLP Project] Bert model 성능 기록 (0) | 2022.11.14 |
---|---|
[NLP Project] Bert 모델에 NER 학습시키기 (텐서플로우) - keras.saving() 해결일지 (0) | 2022.11.11 |
[NLP Project] Padding , Attention mask (1) | 2022.11.11 |
[NLP Project] BERT 인풋 만들기 - 1. 문장 전처리 (3) | 2022.11.09 |
[NLP Project] 1-2. 데이터 전처리 - 특수문자 제거하기 (0) | 2022.11.09 |