Technology  /  NLP

💬 Natural Language Processing 40 guides · updated 2026

From tokenisation and embeddings to transformer-based language understanding — the NLP fundamentals that underpin every modern LLM.

Hugging Face Transformers

Hugging Face Transformers is the dominant library for working with large language models in Python. It provides a unified API for over 400,000 models on the Hugging Face Hub — BERT, GPT-2, LLaMA, Mistral, Falcon, BLOOM, Whisper, and more.


Installation

Terminal window
pip install transformers
pip install torch # or tensorflow / jax
pip install accelerate # for large model loading
pip install datasets # for training data

Pipelines — Fastest Way to Get Results

The pipeline() function wraps a model and tokenizer into a one-line inference call:

from transformers import pipeline

# Each pipeline() call below downloads a default (or named) model from the Hub
# on first use and wraps tokenizer + model behind a single callable.

# Sentiment analysis
sentiment = pipeline("sentiment-analysis")
result = sentiment("The new Claude model is impressively capable and fast.")
print(result)  # [{'label': 'POSITIVE', 'score': 0.9997}]

# Named entity recognition
# aggregation_strategy="simple" merges word pieces into whole entities, which
# is what provides the 'word' and 'entity_group' keys read below.
ner = pipeline("ner", aggregation_strategy="simple")
entities = ner("Satya Nadella, CEO of Microsoft, spoke at their Redmond campus.")
for ent in entities:
    print(f"{ent['word']:<20} {ent['entity_group']:<8} {ent['score']:.3f}")

# Text generation
generator = pipeline("text-generation", model="gpt2")
output = generator(
    "Natural language processing enables",
    max_length=50,
    num_return_sequences=2,
)
for seq in output:
    print(seq['generated_text'])

# Question answering
qa = pipeline("question-answering")
result = qa(
    question="What year was BERT released?",
    context="BERT was introduced by Google researchers in 2018. It uses bidirectional attention.",
)
print(result)  # {'answer': '2018', 'score': 0.994}

# Zero-shot classification
classifier = pipeline("zero-shot-classification")
text = "The Federal Reserve raised interest rates by 25 basis points."
labels = ["finance", "sports", "technology", "politics"]
result = classifier(text, candidate_labels=labels)
# Pair each candidate label with its rounded score for compact display.
print(dict(zip(result['labels'], [round(s, 3) for s in result['scores']])))

Loading Models and Tokenizers

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load a sentiment checkpoint and its matching tokenizer from the Hub.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

texts = [
    "This transformer model achieves excellent results!",
    "The training took way too long and results were poor.",
]

# padding=True pads the batch to its longest member so it can be stacked into
# a single tensor; truncation=True clips anything over the model's max length.
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Inference only: skip gradient tracking.
with torch.no_grad():
    outputs = model(**inputs)
    probs = torch.softmax(outputs.logits, dim=-1)

# id2label maps class indices to the label names stored in the model config.
labels = model.config.id2label
for i, text in enumerate(texts):
    pred_label = labels[probs[i].argmax().item()]
    confidence = probs[i].max().item()
    print(f"[{pred_label} {confidence:.3f}] {text}")

Fine-Tuning for Text Classification

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np

# Sample dataset
data = {
    "text": [
        "Server CPU usage spiked to 98% during peak traffic.",
        "The sourdough bread turned out perfectly golden.",
        "CUDA out of memory error when training on large batches.",
        "Fresh herbs from the garden make pasta taste amazing.",
        "The API rate limit was exceeded after 60 requests.",
        "Homemade pizza dough needs at least 2 hours to rise.",
    ],
    "label": [0, 1, 0, 1, 0, 1],  # 0=tech, 1=food
}
dataset = Dataset.from_dict(data)

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(examples):
    """Tokenize a batch of examples, padding/truncating each text to 128 tokens."""
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)


tokenized = dataset.map(tokenize, batched=True)
# Fixed seed so the (tiny) train/test split is reproducible between runs.
tokenized = tokenized.train_test_split(test_size=0.2, seed=42)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # `evaluation_strategy` is the deprecated spelling of this argument;
    # current Transformers releases expect `eval_strategy`.
    eval_strategy="epoch",
    # Must match the evaluation schedule for load_best_model_at_end to work.
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",
)
def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class for each
            row of ``logits`` is the argmax over its last axis.

    Returns:
        A dict with a single ``"accuracy"`` entry: the fraction of predictions
        equal to the corresponding label.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = (preds == labels).mean()
    return {"accuracy": acc}
# Bundle the model, training configuration, data splits and metric callback
# into a Trainer, then run the fine-tuning loop.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    compute_metrics=compute_metrics,
)

trainer.train()

Token Classification (NER)

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import torch  # needed for the CUDA availability check below

ner_pipeline = pipeline(
    "ner",
    model="dslim/bert-base-NER",
    # Merge word-piece tokens into whole entities ('word' / 'entity_group').
    aggregation_strategy="simple",
    # Use the first GPU when one is available, otherwise fall back to CPU (-1).
    device=0 if torch.cuda.is_available() else -1,
)

text = "In Q2 2025, Nvidia's revenue reached $44.1 billion, driven by data center demand."
results = ner_pipeline(text)
for entity in results:
    print(f"{entity['word']:<20} [{entity['entity_group']}] {entity['score']:.3f}")

Working with LLMs for Generation

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "mistralai/Mistral-7B-Instruct-v0.2"  # requires HF access token
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",  # automatically distributes across GPUs
)

messages = [
    {"role": "user", "content": "Explain BERT's bidirectional attention in 2 sentences."}
]

# return_dict=True returns both input_ids and attention_mask, so generate()
# receives an explicit mask instead of having to infer one.
# add_generation_prompt=True makes the template end where the assistant's
# reply should begin (a no-op for templates that have no such marker).
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(model.device)
prompt_length = inputs["input_ids"].shape[-1]

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=150,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )

# The returned sequence starts with the prompt tokens; slice them off so only
# the newly generated reply is decoded.
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print(response)

Useful Hub Models by Task (2025)

| Task | Model | Notes |
| --- | --- | --- |
| Sentiment | distilbert-base-uncased-finetuned-sst-2-english | Fast, English only |
| NER | dslim/bert-base-NER | CoNLL-2003 trained |
| QA | deepset/roberta-base-squad2 | Extractive QA |
| Summarization | facebook/bart-large-cnn | News summarization |
| Translation | Helsinki-NLP/opus-mt-* | 1000+ language pairs |
| Text classification | Fine-tune DistilBERT | Fastest fine-tuning |
| Sentence similarity | all-MiniLM-L6-v2 | Via sentence-transformers |
| Generation | mistralai/Mistral-7B-Instruct-v0.2 | Open-source, 7B params |