Technology  /  NLP

💬 Natural Language Processing 40 guides · updated 2026

From tokenisation and embeddings to transformer-based language understanding — the NLP fundamentals that underpin every modern LLM.

N-gram Program in NLP

An n-gram program generates sequences of n consecutive words (or characters) from text. This tutorial builds progressively from basic generation to a frequency analyzer and language model.


Basic N-gram Generator

from nltk.util import ngrams
from nltk.tokenize import word_tokenize
import nltk
# Fetch the 'punkt_tab' resource through NLTK's downloader before tokenising.
# NOTE(review): presumably this is the data word_tokenize() relies on — confirm
# against the installed NLTK version.
nltk.download('punkt_tab')
def generate_ngrams(text, n):
    """Return the word-level n-grams of ``text`` as a list.

    The text is lower-cased and split with ``word_tokenize``; every token
    that is not purely alphabetic (``str.isalpha``) is discarded, which is
    how punctuation is removed. The surviving tokens are passed to
    ``ngrams`` with window size ``n``.
    """
    words = list(filter(str.isalpha, word_tokenize(text.lower())))
    return list(ngrams(words, n))
text = "Large language models transform how developers build NLP applications efficiently."

# Each entry: (label to print, n-gram order, how many leading n-grams to show).
for label, order, shown in (
    ("Unigrams (n=1):", 1, 5),
    ("Bigrams (n=2):", 2, 5),
    ("Trigrams (n=3):", 3, 5),
    ("4-grams (n=4):", 4, 4),
):
    print(label, generate_ngrams(text, order)[:shown])

Character N-grams

def char_ngrams(text, n, word_boundary=True):
    """Return the character n-grams of ``text`` as a list of strings.

    Args:
        text: Input string; it is lower-cased before n-grams are extracted.
        n: Number of characters per n-gram; must be at least 1.
        word_boundary: When true, the text is split on whitespace and each
            word is padded with one space on both sides, so n-grams never
            span two words. When false, all whitespace is removed and the
            n-grams slide over the remaining characters as one string.

    Returns:
        The n-grams in order of appearance. A word (or text) shorter than
        ``n`` contributes no n-grams.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    words = text.lower().split()
    if word_boundary:
        # Pad each word with spaces to respect word boundaries
        units = [f" {word} " for word in words]
    else:
        # Joining the split words drops tabs and newlines as well as spaces,
        # keeping both modes consistent about what counts as whitespace.
        units = ["".join(words)]

    result = []
    for unit in units:
        result.extend(unit[i:i + n] for i in range(len(unit) - n + 1))
    return result
word = "transformer"
# Single word, no padding: show character bigrams, then trigrams.
for label, size in (("Char bigrams:", 2), ("Char trigrams:", 3)):
    print(label, char_ngrams(word, size, word_boundary=False))

# Multiple words
text = "nlp model"
print("Word-boundary char-4grams:", char_ngrams(text, 4))

N-gram Frequency Analysis

from nltk.util import ngrams
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import FreqDist
from collections import Counter
import nltk
def analyze_ngrams(corpus, n=2, top_k=15):
    """Return the ``top_k`` most frequent word n-grams found in ``corpus``.

    The corpus is split into sentences with ``sent_tokenize`` and n-grams are
    built inside each sentence separately, so no n-gram crosses a sentence
    boundary. Tokens are lower-cased and non-alphabetic tokens are dropped.
    The result is a list of ``(ngram, count)`` pairs from
    ``Counter.most_common``.
    """
    counts = Counter()
    for sentence in sent_tokenize(corpus):
        words = [tok.lower() for tok in word_tokenize(sentence) if tok.isalpha()]
        counts.update(ngrams(words, n))
    return counts.most_common(top_k)
corpus = """
Machine learning models have fundamentally transformed natural language processing.
Language models learn statistical patterns from vast text corpora.
Modern NLP systems use transformer models for language understanding tasks.
Deep learning has improved accuracy across all NLP tasks significantly.
Transformer models use attention mechanisms to process text sequences efficiently.
Language understanding requires both syntactic and semantic knowledge.
"""

# Ten most frequent bigrams, one per line, n-gram text left-aligned in 30 columns.
print("=== Bigram Frequency Analysis ===")
bigram_rows = analyze_ngrams(corpus, n=2, top_k=10)
for bigram, count in bigram_rows:
    print(f" {' '.join(bigram):<30} {count}")

# Eight most frequent trigrams, n-gram text left-aligned in 40 columns.
print("\n=== Trigram Frequency Analysis ===")
trigram_rows = analyze_ngrams(corpus, n=3, top_k=8)
for trigram, count in trigram_rows:
    print(f" {' '.join(trigram):<40} {count}")

N-gram Language Model

A simple language model that predicts the next word based on the preceding n-1 words:

from collections import defaultdict, Counter
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk, random
class NgramLanguageModel:
    """Count-based n-gram language model.

    Each context (the ``n - 1`` preceding tokens) maps to a ``Counter`` of the
    tokens observed after it during training. Probabilities are the relative
    frequencies of those counts; no smoothing is applied.

    Attributes set by ``__init__``:
        n: Order of the model (2 = bigram, 3 = trigram, ...).
        ngram_counts: ``defaultdict`` mapping a context tuple to a ``Counter``
            of following tokens.
        vocab: Set of every token seen in training, including the ``'<s>'``
            and ``'</s>'`` sentence markers.
    """

    def __init__(self, n=2):
        """Create an untrained model of order ``n``.

        Raises:
            ValueError: If ``n`` is smaller than 1.
        """
        if n < 1:
            raise ValueError(f"n must be a positive integer, got {n!r}")
        self.n = n
        self.ngram_counts = defaultdict(Counter)
        self.vocab = set()

    def _context(self, tokens):
        """Return the last ``n - 1`` items of ``tokens``, lower-cased, as a tuple."""
        size = self.n - 1
        # tokens[-0:] would select the WHOLE sequence, so a unigram model
        # (n == 1) needs its empty context spelled out explicitly.
        if size == 0:
            return ()
        return tuple(token.lower() for token in tokens[-size:])

    def train(self, corpus):
        """Accumulate n-gram counts from ``corpus``.

        The corpus is split into sentences; each sentence is tokenised,
        lower-cased, stripped of non-alphabetic tokens, prefixed with
        ``n - 1`` ``'<s>'`` markers and terminated with ``'</s>'``. Calling
        ``train`` again adds to the existing counts.
        """
        order = self.n
        for sentence in sent_tokenize(corpus):
            words = [t.lower() for t in word_tokenize(sentence) if t.isalpha()]
            tokens = ['<s>'] * (order - 1) + words + ['</s>']
            self.vocab.update(tokens)
            for i in range(len(tokens) - order + 1):
                context = tuple(tokens[i:i + order - 1])
                next_word = tokens[i + order - 1]
                self.ngram_counts[context][next_word] += 1

    def probability(self, word, context):
        """Return the relative frequency of ``word`` after ``context``.

        Only the last ``n - 1`` items of ``context`` are used. Returns ``0.0``
        when the context was never observed.
        """
        # .get() avoids the defaultdict inserting an empty Counter for every
        # unseen context that is merely queried.
        followers = self.ngram_counts.get(self._context(context))
        if not followers:
            return 0.0
        context_count = sum(followers.values())
        if context_count == 0:
            return 0.0
        return followers[word.lower()] / context_count

    def predict_next(self, context, top_k=5):
        """Return up to ``top_k`` ``(word, probability)`` pairs for ``context``.

        Probabilities are rounded to four decimals and are relative to ALL
        continuations seen for the context, so they agree with
        ``probability()`` regardless of ``top_k``. Returns an empty list for
        an unseen context.
        """
        followers = self.ngram_counts.get(self._context(context))
        if not followers:
            return []
        # Normalise by the full context total, not just the top_k slice.
        total = sum(followers.values())
        if not total:
            return []
        return [
            (word, round(count / total, 4))
            for word, count in followers.most_common(top_k)
        ]

    def generate(self, seed_words, max_words=20):
        """Greedily extend ``seed_words`` by up to ``max_words`` tokens.

        At each step the most frequent continuation of the current context is
        appended. Generation stops when the context is unknown, when the most
        frequent continuation is the ``'</s>'`` marker, or when ``max_words``
        tokens have been added. Returns the seed plus generated words joined
        by single spaces.
        """
        tokens = ['<s>'] * (self.n - 1) + [w.lower() for w in seed_words]
        for _ in range(max_words):
            candidates = self.ngram_counts.get(self._context(tokens))
            if not candidates:
                break
            next_word = max(candidates, key=candidates.get)
            # Stop only when end-of-sentence is the chosen continuation, not
            # whenever it is merely one of the possible ones.
            if next_word == '</s>':
                break
            tokens.append(next_word)
        return ' '.join(tokens[self.n - 1:])
# Build a trigram model from a small sample corpus.
corpus = """
Natural language processing enables machines to understand human language.
Language models learn from text and generate coherent sequences.
Machine learning transforms how computers process and analyze text data.
Deep learning models achieve remarkable accuracy on language tasks.
Modern NLP uses transformer architectures for language generation.
"""
model = NgramLanguageModel(n=3)
model.train(corpus)

# Ask the model which words may follow a two-word context.
context = ["language", "models"]
print(f"Context: '{' '.join(context)}'")
predictions = model.predict_next(context, top_k=5)
print("Top predictions:")
for word, prob in predictions:
    print(f" {word:<20} {prob:.4f}")

# Continue a seed phrase with the model's generate() method.
generated = model.generate(["natural", "language"])
print(f"\nGenerated: {generated}")

Visualization

import matplotlib.pyplot as plt
from collections import Counter
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt_tab')

text = """
Transformers use attention mechanisms that allow models to relate different positions
of a sequence when computing a representation. Attention lets the model focus on
relevant parts of the input when producing an output. Language models trained on
large corpora learn rich linguistic representations.
"""

# Count bigrams over the lower-cased alphabetic tokens and keep the ten most frequent.
tokens = [tok.lower() for tok in word_tokenize(text) if tok.isalpha()]
bigrams = Counter(ngrams(tokens, 2))
top_bigrams = bigrams.most_common(10)
labels = [" ".join(pair) for pair, _ in top_bigrams]
counts = [freq for _, freq in top_bigrams]

# Horizontal bars are passed in reversed order, as in a most-frequent-first
# list flipped for plotting.
fig, ax = plt.subplots(figsize=(10, 5))
ax.barh(list(reversed(labels)), list(reversed(counts)))
ax.set_xlabel('Frequency')
ax.set_title('Top 10 Bigrams')
fig.tight_layout()
fig.savefig('bigram_frequency.png', dpi=150)
plt.show()
print("Chart saved to bigram_frequency.png")

Practical Applications Summary

| Application | N-gram Type | How |
| --- | --- | --- |
| Autocomplete / autocorrect | Word bigrams/trigrams | Predict the most likely next word |
| Spam detection | Character 4-6-grams | Catch obfuscation like “V1agra” |
| Language detection | Character bigrams | Each language has a unique n-gram fingerprint |
| Plagiarism detection | Word n-grams | Compare overlap between documents |
| Keyword extraction | Bigrams/trigrams | “machine learning” is more meaningful than “machine” alone |
| Sentiment features | Bigrams | “not good”, “very bad” vs “not bad”, “very good” |