무회blog

python: pytorch,Bert, 로 토큰나이징 하기 본문

Python/TA

python: pytorch,Bert, 로 토큰나이징 하기

최무회 2020. 8. 13. 18:15

 

 

 

In [1]:
import torch
from transformers import AutoModel,AutoTokenizer, BertTokenizer

print(torch.__version__)
torch.set_grad_enabled(False)
 
1.6.0
Out[1]:
<torch.autograd.grad_mode.set_grad_enabled at 0x23cb8c21048>
In [2]:
# 모델 저장하기 , Store the model we want to use
MODEL_NAME = "bert-base-cased"

# We need to create the model and tokenizer
model = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print('model : ',model)
 
model :  BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
        )
        (output): BertOutput(
          (dense): Linear(in_features=3072, out_features=768, bias=True)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
        )
        (output): BertOutput(
          (dense): Linear(in_features=3072, out_features=768, bias=True)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (2): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
        )
        (output): BertOutput(
          (dense): Linear(in_features=3072, out_features=768, bias=True)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (3): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
        )
        (output): BertOutput(
          (dense): Linear(in_features=3072, out_features=768, bias=True)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (4): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
        )
        (output): BertOutput(
          (dense): Linear(in_features=3072, out_features=768, bias=True)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (5): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
        )
        (output): BertOutput(
          (dense): Linear(in_features=3072, out_features=768, bias=True)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (6): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
        )
        (output): BertOutput(
          (dense): Linear(in_features=3072, out_features=768, bias=True)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (7): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
        )
        (output): BertOutput(
          (dense): Linear(in_features=3072, out_features=768, bias=True)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (8): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
        )
        (output): BertOutput(
          (dense): Linear(in_features=3072, out_features=768, bias=True)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (9): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
        )
        (output): BertOutput(
          (dense): Linear(in_features=3072, out_features=768, bias=True)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (10): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
        )
        (output): BertOutput(
          (dense): Linear(in_features=3072, out_features=768, bias=True)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (11): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
        )
        (output): BertOutput(
          (dense): Linear(in_features=3072, out_features=768, bias=True)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
  )
  (pooler): BertPooler(
    (dense): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
  )
)
In [3]:
# 문장 자르기, 뛰어쓰기 기준? 
tokens = tokenizer.tokenize("This is an input example")
print("Tokens: {}".format(tokens))


# 토큰에 정수 숫자 id 정해주기 , not a problem, let's convert tokens to ids.
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Tokens id: {}".format(tokens_ids))

# 충분히 필요한 만큼의 토큰 id 를 추가해주기 , Add the required special tokens
tokens_ids = tokenizer.build_inputs_with_special_tokens(tokens_ids)
print('tokens_ids:           ' ,tokens_ids, type(tokens_ids))


# We need to convert to a Deep Learning framework specific format, let's use PyTorch for now.
# 토큰 id를 기준으로 타입변환을 시켜주기 , list -> tensor 
tokens_pt = torch.tensor([tokens_ids])
print("Tokens PyTorch: {}".format(tokens_pt), type(tokens_pt))

# Now we're ready to go through BERT with out input

outputs, pooled = model(tokens_pt)
print(type(outputs))
print("顺滑的Token wise output: {}, 公用资源Pooled output: {}".format(outputs.shape, pooled.shape))
print('')

print(len(outputs))
print(len(outputs[0]))
print(len(outputs[0][0]))
 
Tokens: ['This', 'is', 'an', 'input', 'example']
Tokens id: [1188, 1110, 1126, 7758, 1859]
tokens_ids:            [101, 1188, 1110, 1126, 7758, 1859, 102] <class 'list'>
Tokens PyTorch: tensor([[ 101, 1188, 1110, 1126, 7758, 1859,  102]]) <class 'torch.Tensor'>
<class 'torch.Tensor'>
顺滑的Token wise output: torch.Size([1, 7, 768]), 公用资源Pooled output: torch.Size([1, 768])

1
7
768
In [4]:
### transformer 시작 
In [5]:
# tokens = tokenizer.tokenize("This is an input example")
# tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
# tokens_pt = torch.tensor([tokens_ids])

# This code can be factored into one-line as follow
tokens_pt2 = tokenizer("This is an input example", return_tensors="pt")

for key, value in tokens_pt2.items():
    print("{}:\n\t{}".format(key, value))

outputs2, pooled2 = model(**tokens_pt2)
print("Difference with previous code: ({}, {})".format((outputs2 - outputs).sum(), (pooled2 - pooled).sum()))
 
input_ids:
	tensor([[ 101, 1188, 1110, 1126, 7758, 1859,  102]])
token_type_ids:
	tensor([[0, 0, 0, 0, 0, 0, 0]])
attention_mask:
	tensor([[1, 1, 1, 1, 1, 1, 1]])
Difference with previous code: (0.0, 0.0)
In [6]:
# token_type_ids: This tensor will map every tokens to their corresponding segment (see below).
# attention_mask: This tensor is used to "mask" padded values in a batch of sequence with different lengths (see below).

# Single segment input
single_seg_input = tokenizer("This is a sample input")

print("Single segment token (str): {}".format(tokenizer.convert_ids_to_tokens(single_seg_input['input_ids'])))
print("Single segment token (int): {}".format(single_seg_input['input_ids']))
print("Single segment type       : {}".format(single_seg_input['token_type_ids']))


# Multiple segment input
multi_seg_input = tokenizer("This is segment A", "This is segment B")
# Segments are concatened in the input to the model, with 
print()
print("Multi segment token (str): {}".format(tokenizer.convert_ids_to_tokens(multi_seg_input['input_ids'])))
print("Multi segment token (int): {}".format(multi_seg_input['input_ids']))
print("Multi segment type       : {}".format(multi_seg_input['token_type_ids']))
 
Single segment token (str): ['[CLS]', 'This', 'is', 'a', 'sample', 'input', '[SEP]']
Single segment token (int): [101, 1188, 1110, 170, 6876, 7758, 102]
Single segment type       : [0, 0, 0, 0, 0, 0, 0]

Multi segment token (str): ['[CLS]', 'This', 'is', 'segment', 'A', '[SEP]', 'This', 'is', 'segment', 'B', '[SEP]']
Multi segment token (int): [101, 1188, 1110, 6441, 138, 102, 1188, 1110, 6441, 139, 102]
Multi segment type       : [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
In [7]:
# Padding highlight
tokens = tokenizer(
    ["This is a sample", "This is another longer sample text"], 
    padding=True  # First sentence will have some PADDED tokens to match second sequence length
)

for i in range(2):
    print("Tokens (int)      : {}".format(tokens['input_ids'][i]))
    print("Tokens (str)      : {}".format([tokenizer.convert_ids_to_tokens(s) for s in tokens['input_ids'][i]]))
    print("Tokens (attn_mask): {}".format(tokens['attention_mask'][i]))
    print()
 
Tokens (int)      : [101, 1188, 1110, 170, 6876, 102, 0, 0]
Tokens (str)      : ['[CLS]', 'This', 'is', 'a', 'sample', '[SEP]', '[PAD]', '[PAD]']
Tokens (attn_mask): [1, 1, 1, 1, 1, 1, 0, 0]

Tokens (int)      : [101, 1188, 1110, 1330, 2039, 6876, 3087, 102]
Tokens (str)      : ['[CLS]', 'This', 'is', 'another', 'longer', 'sample', 'text', '[SEP]']
Tokens (attn_mask): [1, 1, 1, 1, 1, 1, 1, 1]

In [8]:
# from transformers import BertTokenizer, BertModel
# tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
# model = BertModel.from_pretrained("bert-base-multilingual-uncased")
# text = "Replace me by any text you'd like."
# encoded_input = tokenizer(text, return_tensors='pt')
# output = model(**encoded_input)
In [10]:
# # 텐서플로우 혹은 파이토츠리로 버트 모델 불러오기 
# # Let's load a BERT model for TensorFlow and PyTorch
from transformers import TFBertModel, BertModel
model_tf = TFBertModel.from_pretrained('bert-base-uncased')
model_pt = BertModel.from_pretrained('bert-base-uncased')
 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFBertModel for predictions without further training.
In [11]:
# transformers generates a ready to use dictionary with all the required parameters for the specific framework.
input_tf = tokenizer("This is a sample input", return_tensors="tf")
input_pt = tokenizer("This is a sample input", return_tensors="pt")

# Let's compare the outputs
output_tf, output_pt = model_tf(input_tf), model_pt(**input_pt)

# Models outputs 2 values (The value for each tokens, the pooled representation of the input sentence)
# Here we compare the output differences between PyTorch and TensorFlow.
for name, o_tf, o_pt in zip(["output", "pooled"], output_tf, output_pt):
    print("{} differences: {:.5}".format(name, (o_tf.numpy() - o_pt.numpy()).sum()))
 
output differences: 1.7832e-05
pooled differences: 7.1824e-06
In [13]:
from transformers import DistilBertModel
bert_distil = DistilBertModel.from_pretrained('distilbert-base-uncased')
input_pt = tokenizer(
    'This is a sample input to demonstrate performance of distiled models especially inference time', 
    return_tensors="pt"
)


%time _ = bert_distil(input_pt['input_ids'])
%time _ = model_pt(input_pt['input_ids'])
 
 
 
 
 
 
Wall time: 20 ms
Wall time: 37 ms
In [31]:
from transformers import TFBertModel,BertModel,DistilBertModel,AutoModel
from transformers import AutoTokenizer, BertTokenizer
# Let's load 한국어 BERT from 멀티 라이브러리 
de_bert = BertModel.from_pretrained('bert-base-multilingual-uncased')
de_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

de_input = de_tokenizer(
    "오늘따라 날씨도 안좋게 비가 오지 않고 있어서 버트 테스트를 해보고 있다.그래서 기분이 좋았다. 이유는 날씨가  좋지 않아서다",
    return_tensors="pt"
)
print("Tokens (int)      : {}".format(de_input['input_ids'].tolist()[0]))
print("Tokens (str)      : {}".format([de_tokenizer.convert_ids_to_tokens(s) for s in de_input['input_ids'].tolist()[0]]))
print("Tokens (attn_mask): {}".format(de_input['attention_mask'].tolist()[0]))
print()

output_de, pooled_de = de_bert(**de_input)

print("Token wise output: {}, Pooled output: {}".format(outputs.shape, pooled.shape))
 
Tokens (int)      : [101, 1174, 29347, 97071, 63277, 97073, 89474, 76818, 47928, 12265, 1174, 26646, 20766, 97109, 15051, 1170, 32035, 11376, 1174, 77072, 44853, 86008, 1170, 33645, 15037, 1179, 40389, 36979, 11643, 46957, 27884, 12300, 11724, 119, 13988, 37051, 12516, 1163, 32035, 85693, 1175, 29347, 97109, 29597, 119, 12398, 42159, 11192, 76818, 47928, 11376, 1175, 29347, 97109, 12799, 1174, 25539, 91011, 12516, 12261, 102]
Tokens (str)      : ['[CLS]', 'ᄋ', '##ᅩ', '##ᄂ', '##ᅳᆯ', '##ᄄ', '##ᅡ라', '날', '##씨', '##도', 'ᄋ', '##ᅡᆫ', '##조', '##ᇂ', '##게', 'ᄇ', '##ᅵ', '##가', 'ᄋ', '##ᅩ지', '않고', '있어서', 'ᄇ', '##ᅥ', '##트', 'ᄐ', '##ᅦ', '##스트', '##를', '해', '##보', '##고', '있다', '.', '그', '##래', '##서', 'ᄀ', '##ᅵ', '##분이', 'ᄌ', '##ᅩ', '##ᇂ', '##았다', '.', '이', '##유', '##는', '날', '##씨', '##가', 'ᄌ', '##ᅩ', '##ᇂ', '##지', 'ᄋ', '##ᅡ', '##ᆭ아', '##서', '##다', '[SEP]']
Tokens (attn_mask): [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

Token wise output: torch.Size([1, 7, 768]), Pooled output: torch.Size([1, 768])
Comments