
Pipeline
from transformers import pipeline
sentiment_classifier = pipeline("sentiment-analysis")
zero_shot_classifier = pipeline("zero-shot-classification")
generator = pipeline("text-generation")
unmasker = pipeline("fill-mask")
ner = pipeline("ner", grouped_entities=True)
question_answer = pipeline("question-answering")
summarizer = pipeline("summarization")
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-en")
classifier("I've been waiting for a HuggingFace course my whole life.")
classifier("This is a course about the Transforemrs library",
candidate_labels=["education", "politics", "business"])
generator("In this course, we will teach you how to ")
prompts = ["In this course, we will teach you how to ", "This is a course about the Transformers library"]
for prompt in prompts:
    print(generator(prompt, num_return_sequences=1, max_length=50))
generator = pipeline("text-generation", model="huggingtweets/dril")
unmasker("This course will teach you all about <mask> models", top_k=2)
ner("My name is Sylvain and I work at Hugging Face in Brooklyn.")
question_answer(question="Where do I work?",
                context="My name is Sylvain and I work at Hugging Face in Brooklyn.")
summarizer(
'''
Michael Joseph Jackson (August 29, 1958 – June 25, 2009) was an American singer,
songwriter, dancer, and philanthropist. Known as the "King of Pop", he is regarded
as one of the most significant cultural figures of the 20th century. During his
four-decade career, his contributions to music, dance, and fashion, along with his
publicized personal life, made him a global figure in popular culture. Jackson influenced
artists across many music genres; through stage and video performances, he popularized
complicated dance moves such as the moonwalk, to which he gave the name, as well as the robot.
'''
)
translator("Ce cours est produit par HuggingFace")
Training
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
raw_inputs = [
"I've been waiting for a HuggingFace course my whole life.",
"I hate this too much!"
]
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
model = AutoModel.from_pretrained(checkpoint)
outputs = model(**inputs)
outputs.last_hidden_state.shape  # (batch_size, sequence_length, hidden_size)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
model.config.id2label
outputs = model(**inputs)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
model.save_pretrained('./test')
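A small follow-up sketch, reusing the model and predictions from just above, to turn the softmax scores into readable labels via id2label:
for scores in predictions:
    label_id = int(torch.argmax(scores))
    print(model.config.id2label[label_id], round(float(scores[label_id]), 4))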
Bert
from transformers import BertConfig, BertModel
config = BertConfig()
config.hidden_size = 48  # shrink the hidden size from the default 768
model = BertModel(config)  # randomly initialized weights, not a pretrained checkpoint
Summary
- dataloader -> model -> optimizer -> loss -> training loop (a minimal runnable sketch follows)
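A minimal sketch of that flow with toy stand-ins; the nn.Linear model and random tensors below are hypothetical placeholders, not the MRPC setup further down.
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

model = nn.Linear(4, 2)                                     # model
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)  # optimizer
loss_fn = nn.CrossEntropyLoss()                             # loss
dataloader = DataLoader(                                    # dataloader
    TensorDataset(torch.randn(32, 4), torch.randint(0, 2, (32,))), batch_size=8
)

for epoch in range(3):                                      # training loop
    for features, labels in dataloader:
        loss = loss_fn(model(features), labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()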
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequence = "I've been waiting for a HuggingFace course my while life."
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor([ids])  # note: tokenize() does not add the [CLS]/[SEP] special tokens
print("input_ids : ", input_ids)
output = model(input_ids)
print("output : ", output.logits)
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]
attention_mask = [
    [1, 1, 1],
    [1, 1, 0],  # 0 tells the model to ignore the padding position
]
outputs = model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask))
print(outputs.logits)
Making the dataloader convenient to use! (a small dynamic-padding sketch follows)
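A minimal sketch of what DataCollatorWithPadding buys you, assuming the same bert-base-uncased tokenizer: each batch is padded only to its own longest sequence. The names tok, collator, and features are just illustrative.
from transformers import AutoTokenizer, DataCollatorWithPadding

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
collator = DataCollatorWithPadding(tokenizer=tok)
features = [tok("Short sentence."), tok("A much longer sentence that needs quite a few more tokens.")]
batch = collator(features)
print(batch["input_ids"].shape)    # padded to the longest sequence in this batch only
print(batch["attention_mask"][0])  # the shorter sample gets 0s over the padding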
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, DataCollatorWithPadding, AdamW, get_scheduler
from torch.utils.data import DataLoader
from tqdm import tqdm
# note: in newer releases transformers.AdamW and datasets.load_metric are deprecated
# (torch.optim.AdamW and the evaluate library are the replacements)
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names
train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)
for batch in train_dataloader:
    break  # grab a single batch to inspect
{k: v.shape for k, v in batch.items()}  # sequence length varies per batch (dynamic padding)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
progress_bar = tqdm(range(num_training_steps))
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
metric = load_metric("glue", "mrpc")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
metric.compute()