▷ Today's study plan: GPT lectures (4~7)
The incompleteness of natural language data
Solving a particular natural language task requires many different kinds of labels, and the data formats are also very complex.
Few-shot, zero-shot (see the prompt sketch after the table below)
|              | BERT                 | GPT-1                       | GPT-2                       | GPT-3                       |
|--------------|----------------------|-----------------------------|-----------------------------|-----------------------------|
| Pre-training | O                    | O                           | O                           | O                           |
| Fine-tuning  | O                    | O                           | X                           | X                           |
| Structure    | Transformer Encoder  | Transformer Decoder         | Transformer Decoder         | Transformer Decoder         |
| Attention    | Multi-head attention | Masked multi-head attention | Masked multi-head attention | Masked multi-head attention |
| Training     | MLM                  | Next word prediction        | Next word prediction        | Next word prediction        |
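The table shows that GPT-2 and GPT-3 drop fine-tuning and instead rely on zero- and few-shot prompting: the task description (and optionally a few solved examples) goes directly into the prompt. A minimal sketch of the two prompt styles with the text-generation pipeline used below (the prompts are illustrative; the small default model will not translate reliably):
from transformers import pipeline
generator = pipeline('text-generation')
# zero-shot: only the task description, no solved examples
zero_shot_prompt = 'Translate English to French: cheese =>'
# few-shot: a few solved examples precede the query
few_shot_prompt = (
    'Translate English to French:\n'
    'sea otter => loutre de mer\n'
    'peppermint => menthe poivrée\n'
    'cheese =>'
)
generator(zero_shot_prompt, max_length=30)
generator(few_shot_prompt, max_length=60)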
pip install transformers
# pip install transformers[sentencepiece]
pip install datasets
from transformers import pipeline
classifier = pipeline('sentiment-analysis')        # default sentiment-analysis pipeline
classifier = pipeline('zero-shot-classification')  # re-created as a zero-shot classifier
classifier(
'this is a course about the transformers library',
candidate_labels=['education', 'politics', 'business']
)
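The zero-shot pipeline scores the sentence against each candidate label; the result is a dict with the labels ranked by score, roughly like this (scores are illustrative):
# {'sequence': 'this is a course about the transformers library',
#  'labels': ['education', 'business', 'politics'],
#  'scores': [0.84, 0.11, 0.05]}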
generator = pipeline('text-generation')
generator('In this class, we will learn how to ',
num_return_sequences = 2,
max_length = 30)
unmasker = pipeline('fill-mask')
unmasker('this is a course about the <mask> library', top_k=2)
question_answerer = pipeline('question-answering')
question_answerer(
question = 'Where do I work?',
context = 'My name is Sylvain and I work at Hugging Face in Brooklyn'
)
summarizer = pipeline('summarization')
summarizer()  # pass the sentences to summarize
translator = pipeline('translation', model=" ")  # a translation checkpoint must be specified here
translator('Ce cours est produit par Hugging Face.')
from transformers import AutoTokenizer
checkpoint = "distilbert-based-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
raw_inputs = [
"I have been waiting for a higgingface course my whole life",
"I hate this so much",
]
inputs = tokenizer(raw_inputs, padding=True,
truncation=True, return_tensors='pt')
inputs['input_ids']
inputs['attention_mask']
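To check what the tokenizer produced, map the ids back to tokens; the [CLS]/[SEP] special tokens and the [PAD] positions that attention_mask zeroes out become visible:
tokenizer.convert_ids_to_tokens(inputs['input_ids'][1])  # the shorter sentence gets [PAD] tokens
tokenizer.decode(inputs['input_ids'][1])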
from transformers import AutoModel
model = AutoModel.from_pretrained(checkpoint)
outputs = model(**inputs)
outputs.last_hidden_state.shape
import torch
from transformers import AutoModelForSequenceClassification
# the SST-2 checkpoint is a text classifier, so load it with a sequence-classification head
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
model.config.id2label
outputs = model(**inputs)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
predictions
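To turn the probabilities into readable predictions, map each row's argmax back through the id2label mapping printed above:
for sentence, probs in zip(raw_inputs, predictions):
    label_id = int(torch.argmax(probs))
    print(sentence, '->', model.config.id2label[label_id], round(probs[label_id].item(), 4))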
from transformers import BertConfig, BertModel
config = BertConfig()
model = BertModel(config)
model
model.save_pretrained('./test')
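save_pretrained writes config.json plus the weight file into the directory, and the model can be reloaded from it exactly like a Hub checkpoint:
model = BertModel.from_pretrained('./test')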
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from transformers import AdamW
from transformers import get_scheduler
from datasets import load_metric
from tqdm import tqdm
raw_datasets = load_dataset('glue', 'mrpc')
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
raw_datasets['train'].features
def tokenize_function(example):
return tokenizer(example['sentence1'], example['sentence2'], truncation=True)
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
tokenized_datasets = tokenized_datasets.remove_columns(['sentence1', 'sentence2', 'idx'])
tokenized_datasets = tokenized_datasets.rename_column('label', 'labels')
tokenized_datasets.set_format('torch')
tokenized_datasets['train'].column_names
train_dataloader = DataLoader(
tokenized_datasets['train'], shuffle=True, batch_size=8, collate_fn = data_collator
)
eval_dataloader = DataLoader(
tokenized_datasets['validation'], batch_size=8, collate_fn = data_collator
)
tokenized_datasets
for batch in train_dataloader:
break
{k: v.shape for k, v in batch.items()}
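Because DataCollatorWithPadding pads each batch only to its longest sample, the second dimension below changes from batch to batch; an illustrative result:
# {'attention_mask': torch.Size([8, 65]),
#  'input_ids': torch.Size([8, 65]),
#  'labels': torch.Size([8]),
#  'token_type_ids': torch.Size([8, 65])}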
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
'linear',
optimizer = optimizer,
num_warmup_steps=0,
num_training_steps=num_training_steps,
)
print(num_training_steps)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
device
progress_bar = tqdm(range(num_training_steps))
model.train()
for epoch in range(num_epochs):
for batch in train_dataloader:
batch = {k: v.to(device) for k, v in batch.items()}
outputs=model(**batch)
loss=outputs.loss
loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
progress_bar.update(1)
metric = load_metric('glue', 'mrpc')
model.eval()
for batch in eval_dataloader:
batch = {k: v.to(device) for k, v in batch.items()}
with torch.no_grad():
outputs = model(**batch)
logits = outputs.logits
predictions = torch.argmax(logits, dim=-1)
metric.add_batch(predictions=predictions,
references = batch['labels'])
metric.compute()
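compute() aggregates everything collected through add_batch and returns the GLUE MRPC metrics, accuracy and F1; the exact numbers depend on the run:
# Illustrative output:
# {'accuracy': 0.85, 'f1': 0.89}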
import transformers
from transformers import BertConfig, BertModel
transformers.__path__
# import pdb
# pdb.set_trace()
config = BertConfig()
model = BertModel(config)
print(config)
from datasets import load_dataset
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
def tokenize_and_split(examples):
return tokenizer(
examples['review'],
truncation=True,
max_length=128,
return_overflowing_tokens=True
)
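With return_overflowing_tokens=True one long review can become several feature rows, so when this is used with Dataset.map the old columns have to be dropped (row counts no longer match), and overflow_to_sample_mapping links each chunk back to its source row. A minimal sketch, assuming drug_dataset is a DatasetDict with the 'review' column tokenized above:
tokenized_dataset = drug_dataset.map(
    tokenize_and_split,
    batched=True,
    remove_columns=drug_dataset['train'].column_names,  # row count changes, so drop the old columns
)
tokenized_dataset['train'][0].keys()  # includes 'overflow_to_sample_mapping'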
from transformers import AutoTokenizer
old_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
example = '''sample'''
tokens = old_tokenizer.tokenize(example)
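train_new_from_iterator (used just below) expects an iterator over batches of raw text; training_corpus is not defined in these notes, so here is a minimal sketch built from a small public dataset (the dataset choice is an assumption):
from datasets import load_dataset
corpus_dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train')  # assumed corpus
def get_training_corpus():
    # yield the raw text in batches of 1000 examples
    for i in range(0, len(corpus_dataset), 1000):
        yield corpus_dataset[i:i + 1000]['text']
training_corpus = get_training_corpus()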
tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, 52000)
# 52000 = vocab_size
tokens = tokenizer.tokenize(example)
print(len(tokens))
print(len(old_tokenizer.tokenize(example)))
print(tokenizer.backend_tokenizer.normalizer.normalize_str('Hello, how are u?'))
tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str('Hello, how are u?')
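For a BERT-style uncased tokenizer the normalizer lowercases (and strips accents), while the pre-tokenizer splits on whitespace and punctuation and keeps character offsets; the two calls above return roughly:
# 'hello, how are u?'
# [('Hello', (0, 5)), (',', (5, 6)), ('how', (7, 10)),
#  ('are', (11, 14)), ('u', (15, 16)), ('?', (16, 17))]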