NLTK — Natural Language Toolkit

NLTK (Natural Language Toolkit) is Python’s foundational NLP library. It provides essential tools for text processing — tokenization, stemming, POS tagging, parsing, and access to over 50 corpora. While newer libraries like spaCy and Hugging Face handle production workloads faster, NLTK remains the best starting point for learning NLP concepts.

Installation and Setup

pip install nltk

Download the corpora and models you need:

import nltk

# Full download of every corpus and model (large, ~3GB total):
# nltk.download('all')

# Or fetch just the resources you need, one download call per resource
required_resources = (
    'punkt_tab',                       # Tokenizer
    'averaged_perceptron_tagger_eng',  # POS tagger
    'wordnet',                         # Lemmatizer
    'stopwords',                       # Stopword lists
    'vader_lexicon',                   # Sentiment analyzer
    'maxent_ne_chunker_tab',           # NER chunker
    'words',                           # English word corpus
)
for resource in required_resources:
    nltk.download(resource)

Tokenization

from nltk.tokenize import word_tokenize, sent_tokenize, TweetTokenizer

text = (
    "NLP has evolved dramatically in 2025! "
    "Models like Claude and GPT-4 demonstrate remarkable language understanding."
)

# Word-level split: punctuation such as '!' comes out as its own token
words = word_tokenize(text)
print("Words:", words[:8])
# ['NLP', 'has', 'evolved', 'dramatically', 'in', '2025', '!', 'Models']

# Sentence-level split
sentences = sent_tokenize(text)
print("Sentences:", sentences)

# Tweet tokenizer (handles @mentions, #hashtags, emoji)
tweet = "@user This new LLM is sooooo impressive!!! #NLP #AI"
tweet_tokenizer = TweetTokenizer(
    strip_handles=True,  # drop @user handles from the output
    reduce_len=True,     # shorten long character runs ("sooooo" -> "sooo")
)
tweet_tokens = tweet_tokenizer.tokenize(tweet)
print("Tweet tokens:", tweet_tokens)
# ['This', 'new', 'LLM', 'is', 'sooo', 'impressive', '!', '!', '!', '#NLP', '#AI']

Stemming

from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer

words = ["running", "studies", "happily", "generously", "transformers"]

# One instance of each stemming algorithm, to compare them side by side
porter = PorterStemmer()
snowball = SnowballStemmer("english")
lancaster = LancasterStemmer()

# Table header followed by a divider line
print(f"{'Word':<15} {'Porter':<12} {'Snowball':<12} {'Lancaster'}")
print("-" * 55)

# One row per word: the original word, then its stem from each algorithm
for word in words:
    porter_stem = porter.stem(word)
    snowball_stem = snowball.stem(word)
    lancaster_stem = lancaster.stem(word)
    print(f"{word:<15} {porter_stem:<12} {snowball_stem:<12} {lancaster_stem}")

Lemmatization

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lem = WordNetLemmatizer()

# First letter of a Penn Treebank-style tag (e.g. "VBG") -> WordNet POS constant.
# Built once here instead of on every call.
_TAG_INITIAL_TO_WORDNET = {
    'J': wordnet.ADJ,
    'V': wordnet.VERB,
    'N': wordnet.NOUN,
    'R': wordnet.ADV,
}


# Lemmatize with POS for best accuracy
def lemmatize_with_pos(word, pos_tag):
    """Lemmatize *word* using the WordNet POS implied by *pos_tag*.

    Args:
        word: The token to lemmatize.
        pos_tag: A POS tag such as "VBG" or "NNS". Only its first
            character is inspected. An empty or ``None`` tag, or one
            whose first letter is not J/V/N/R, is treated as a noun.

    Returns:
        The lemma produced by ``lem.lemmatize`` for the chosen POS.
    """
    # Slice instead of indexing so an empty tag falls back to the noun
    # default rather than raising IndexError.
    initial = (pos_tag or "")[:1]
    wn_pos = _TAG_INITIAL_TO_WORDNET.get(initial, wordnet.NOUN)
    return lem.lemmatize(word, pos=wn_pos)

# (word, POS tag) samples: one verb, one noun, one adjective, one adverb
pairs = [
    ("running", "VBG"),
    ("studies", "NNS"),
    ("better", "JJR"),
    ("quickly", "RB"),
]
for word, tag in pairs:
    lemma = lemmatize_with_pos(word, tag)
    print(f"{word:<12} → {lemma}")
# running      → run
# studies      → study
# better       → good
# quickly      → quickly

POS Tagging

from nltk import pos_tag, word_tokenize

text = "The neural network efficiently processes sequential text data."

# pos_tag takes a list of tokens and returns (token, tag) pairs
tokens = word_tokenize(text)
tagged = pos_tag(tokens)

print(tagged)
# [('The', 'DT'), ('neural', 'JJ'), ('network', 'NN'), ('efficiently', 'RB'), ...]

# Filter by tag prefix: 'NN...' tags for nouns, 'VB...' tags for verbs
nouns = [(token, tag) for token, tag in tagged if tag.startswith('NN')]
verbs = [(token, tag) for token, tag in tagged if tag.startswith('VB')]

print("Nouns:", nouns)
print("Verbs:", verbs)

Named Entity Recognition

from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

text = "Anthropic was founded in San Francisco by Dario Amodei in 2021."

# Pipeline: tokenize -> POS-tag -> chunk named entities
tokens = word_tokenize(text)
tagged = pos_tag(tokens)
entities = ne_chunk(tagged)

# Only the Tree children of the chunked result are named entities.
# For each one, join its words and pair the phrase with the subtree label.
named_entities = [
    (" ".join(word for word, _ in subtree), subtree.label())
    for subtree in entities
    if isinstance(subtree, Tree)
]

print(named_entities)
# Example output:
# [('Anthropic', 'ORGANIZATION'), ('San Francisco', 'GPE'), ('Dario Amodei', 'PERSON')]

Sentiment Analysis (VADER)

from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

reviews = [
    "This library is absolutely fantastic for learning NLP concepts!",
    "The documentation is outdated and the API is confusing.",
    "It works, but there are faster alternatives available.",
]

# Cut-offs on VADER's compound score. The boundaries are inclusive, matching
# the thresholds documented by VADER's authors (>= 0.05 positive,
# <= -0.05 negative, anything in between neutral).
POSITIVE_THRESHOLD = 0.05
NEGATIVE_THRESHOLD = -0.05


def label_sentiment(compound):
    """Map a VADER compound score to "Positive", "Negative", or "Neutral"."""
    if compound >= POSITIVE_THRESHOLD:
        return "Positive"
    if compound <= NEGATIVE_THRESHOLD:
        return "Negative"
    return "Neutral"


for review in reviews:
    scores = sia.polarity_scores(review)
    sentiment = label_sentiment(scores['compound'])
    # Show the label, the compound score, and the first 50 characters of the review
    print(f"{sentiment} ({scores['compound']:.3f}): {review[:50]}")

Frequency Distributions

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import FreqDist

text = """
Transformer models have revolutionized natural language processing.
Attention mechanisms allow models to capture long-range dependencies.
BERT, GPT, and Claude are built on transformer architecture.
"""

# A set gives constant-time membership checks while filtering
stop_words = set(stopwords.words('english'))

# Staged filter: keep alphabetic tokens, lowercase them, drop stopwords
alphabetic = (w for w in word_tokenize(text) if w.isalpha())
lowered = (w.lower() for w in alphabetic)
tokens = [w for w in lowered if w not in stop_words]

fdist = FreqDist(tokens)
print("Most common words:", fdist.most_common(10))
fdist.plot(10)  # frequency chart of the top 10 words; requires matplotlib to be installed

When to Use NLTK vs Other Libraries

Task	NLTK	spaCy	Hugging Face
Learning NLP	Best	Good	Overkill for basics
Production pipeline	Slow	Fast	Best for accuracy
Corpus access	50+ corpora	Limited	Large Hub
Custom grammars	Yes (CFG)	No	No
Multilingual	Limited	Good (20+ langs)	Excellent
Sentiment (VADER)	Built-in	No	Requires model

NLTK shines for education, research prototypes, and tasks that need grammar-based analysis. For production NLP in 2025, spaCy handles most preprocessing needs faster, while Hugging Face models provide the highest accuracy.