Technology  /  NLP

💬 Natural Language Processing 40 guides · updated 2026

From tokenisation and embeddings to transformer-based language understanding — the NLP fundamentals that underpin every modern LLM.

Build a Chatbot Using NLP

Chatbots range from simple rule-based responders to sophisticated conversational AI systems. This guide walks through four approaches — from a lightweight TF-IDF FAQ matcher to an LLM-powered assistant.


Approach 1: Rule-Based with TF-IDF

The simplest production-viable chatbot uses TF-IDF vectors to find the stored FAQ question most similar to the user's message and returns that question's answer:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# FAQ knowledge base: each entry pairs a canonical question with its answer.
faq = dict([
    (
        "What is NLP?",
        "NLP (Natural Language Processing) is the field of AI that enables "
        "computers to understand and generate human language.",
    ),
    (
        "What libraries are used for NLP in Python?",
        "Popular NLP libraries include spaCy, NLTK, Hugging Face "
        "Transformers, and TextBlob.",
    ),
    (
        "What is tokenization?",
        "Tokenization splits text into words, subwords, or characters for "
        "processing by NLP models.",
    ),
    (
        "What is sentiment analysis?",
        "Sentiment analysis classifies text as positive, negative, or "
        "neutral based on its emotional tone.",
    ),
    (
        "What are word embeddings?",
        "Word embeddings are dense numeric vectors that represent words in "
        "a continuous semantic space.",
    ),
    (
        "How does BERT work?",
        "BERT uses bidirectional transformer attention pretrained on masked "
        "language modeling to create contextual word representations.",
    ),
])

# Parallel lists: questions[i] is answered by answers[i] (dicts keep
# insertion order, so both lists follow the order of the table above).
questions = [*faq]
answers = [*faq.values()]
# Fit the TF-IDF vocabulary on the FAQ questions once; every incoming query
# is later projected into this same vector space.
vectorizer = TfidfVectorizer()
question_vectors = vectorizer.fit_transform(questions)


def chatbot_tfidf(user_input, threshold=0.2):
    """Reply with the answer whose FAQ question best matches ``user_input``.

    Args:
        user_input: Free-text message from the user.
        threshold: Minimum cosine similarity the best match must reach.

    Returns:
        The answer paired with the most similar stored question, or a fixed
        "please rephrase" message when the best similarity is below
        ``threshold``.
    """
    query_vector = vectorizer.transform([user_input])
    similarities = cosine_similarity(query_vector, question_vectors).ravel()
    top = similarities.argmax()
    fallback = "I'm not sure I understand. Could you rephrase that?"
    return answers[top] if similarities[top] >= threshold else fallback
# Demo run: a few NLP questions plus one message that is off-topic for the FAQ.
test_inputs = [
    "explain nlp to me",
    "which python library for text processing",
    "how does bert understand context",
    "what's the weather today",
]
for inp in test_inputs:
    response = chatbot_tfidf(inp)
    # One call, two lines; the trailing "\n" leaves a blank line between turns.
    print(f"User: {inp}", f"Bot: {response}\n", sep="\n")

Approach 2: Intent Classification with Transformers

A more robust chatbot classifies the user’s intent and dispatches to a handler:

from transformers import pipeline
import json
# Intents the bot can recognise, each with a few example utterances.
intents = {
    "greeting": ["hello", "hi", "good morning", "hey there", "what's up"],
    "farewell": ["bye", "goodbye", "see you", "take care", "talk later"],
    "help": ["help", "I need assistance", "can you help me", "support"],
    "product_info": ["tell me about", "what is", "explain", "describe"],
    "pricing": ["how much", "price", "cost", "fee", "subscription"],
}

# Flatten the intent table into supervised-training form:
#   label_to_idx  - intent name -> integer id, in declaration order
#   train_texts   - every example utterance, intent by intent
#   train_labels  - the intent name for the utterance at the same position
# Comprehensions replace the nested append loop and keep the loop variables
# out of the module namespace.
label_to_idx = {intent: idx for idx, intent in enumerate(intents)}
train_texts = [
    example for examples in intents.values() for example in examples
]
train_labels = [
    intent for intent, examples in intents.items() for _ in examples
]
# Zero-shot classification (no training required).
# The checkpoint is named explicitly instead of relying on the pipeline's
# implicit default, so the model in use is visible in the code and does not
# depend on which default the installed transformers version picks.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
)
intent_labels = list(intents.keys())


def classify_intent(user_input):
    """Return the most likely intent for ``user_input`` and its score.

    Args:
        user_input: Free-text message from the user.

    Returns:
        A ``(label, score)`` pair: the top-ranked entry of ``intent_labels``
        and the score the classifier assigned to it.
    """
    result = classifier(user_input, candidate_labels=intent_labels)
    # The pipeline returns labels and scores ordered from most to least
    # likely, so position 0 holds the best candidate.
    return result["labels"][0], result["scores"][0]
# Intent handlers
# Canned reply per intent, built once at import time instead of on every call.
_INTENT_RESPONSES = {
    "greeting": "Hello! I'm your NLP assistant. How can I help you today?",
    "farewell": "Goodbye! Feel free to come back if you have more questions.",
    "help": "I can answer questions about NLP, machine learning, and Python libraries.",
    "product_info": "Could you specify which product or concept you'd like to know more about?",
    "pricing": "For pricing information, please visit our pricing page or contact our sales team.",
}
# Reply used when the intent has no entry in the table above.
_UNKNOWN_INTENT_RESPONSE = "I'm not sure how to help with that."


def handle_intent(intent, user_input):
    """Return the canned reply for ``intent``.

    Args:
        intent: Intent name, e.g. ``"greeting"`` or ``"pricing"``.
        user_input: The original user message. It is accepted so every
            handler call has the same shape, but the canned replies do not
            use it.

    Returns:
        The reply string for ``intent``, or a generic fallback when the
        intent is not in the response table.
    """
    return _INTENT_RESPONSES.get(intent, _UNKNOWN_INTENT_RESPONSE)
# Chatbot loop: classify each sample message, then dispatch to its handler.
test_messages = [
    "hey there!",
    "can you help me understand something",
    "tell me about word embeddings",
    "how much does this service cost",
    "talk to you later",
]
for msg in test_messages:
    intent, confidence = classify_intent(msg)
    response = handle_intent(intent, msg)
    # Three lines per turn; the trailing "\n" leaves a blank line after each.
    print(
        f"User: {msg}",
        f"Intent: {intent} ({confidence:.3f})",
        f"Bot: {response}\n",
        sep="\n",
    )

Approach 3: Semantic Search Chatbot

Find the best answer from a knowledge base using sentence embeddings:

from sentence_transformers import SentenceTransformer, util
import torch
# Load the sentence-embedding model once; the same instance encodes both the
# knowledge-base questions and every user query further down.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Knowledge base: one record per entry, with "question" and "answer" fields.
kb = [
    dict(
        question="What is retrieval-augmented generation?",
        answer=(
            "RAG combines a retriever that searches a knowledge base with an "
            "LLM that generates answers grounded in the retrieved context."
        ),
    ),
    dict(
        question="How do I fine-tune a BERT model?",
        answer=(
            "Fine-tune BERT by adding a task-specific head (e.g., linear "
            "classifier), then training on labeled examples for your target "
            "task using Hugging Face Trainer."
        ),
    ),
    dict(
        question="What is the difference between encoder and decoder transformers?",
        answer=(
            "Encoders (BERT) read full context bidirectionally and are used "
            "for understanding tasks. Decoders (GPT) generate text "
            "autoregressively left-to-right."
        ),
    ),
    dict(
        question="When should I use TF-IDF instead of embeddings?",
        answer=(
            "Use TF-IDF for keyword-based search, when you need "
            "interpretability, or when you need fast inference without a GPU. "
            "Use embeddings for semantic similarity and when word overlap "
            "doesn't capture meaning."
        ),
    ),
]
# Questions only, in the same order as kb, so position i refers to kb[i].
kb_questions = [entry["question"] for entry in kb]
# Embed every knowledge-base question once up front; answering a query then
# needs a single encode call plus a similarity lookup.
kb_embeddings = model.encode(kb_questions, convert_to_tensor=True)


def semantic_chatbot(user_input, threshold=0.5):
    """Answer ``user_input`` from the knowledge base by embedding similarity.

    Args:
        user_input: Free-text question from the user.
        threshold: Minimum cosine similarity the best match must reach.

    Returns:
        A ``(reply, score)`` pair. ``reply`` is the answer of the closest
        knowledge-base question, or a fixed "no information" message when the
        best similarity is below ``threshold``; ``score`` is that best
        similarity as a Python float in both cases.
    """
    query_vector = model.encode(user_input, convert_to_tensor=True)
    similarities = util.cos_sim(query_vector, kb_embeddings)[0]
    top = similarities.argmax().item()
    top_score = similarities[top].item()
    if top_score >= threshold:
        reply = kb[top]["answer"]
    else:
        reply = "I don't have information about that yet. Try rephrasing or ask something else."
    return reply, top_score
# Demo run: paraphrases of the four knowledge-base questions.
queries = [
    "explain RAG to me",
    "how do I train a BERT classifier",
    "decoder vs encoder difference",
    "should I use keyword search or vector search",
]
for q in queries:
    response, score = semantic_chatbot(q)
    # Score first so weak matches are easy to spot; blank line after each pair.
    print(f"Q [{score:.3f}]: {q}", f"A: {response}\n", sep="\n")

Approach 4: LLM-Powered Chatbot with OpenAI

from openai import OpenAI
# Shared API client used by NLPChatbot.chat below.
# NOTE(review): no API key is passed here, so the SDK has to find credentials
# on its own (OPENAI_API_KEY by default — confirm against the openai-python docs).
client = OpenAI()
class NLPChatbot:
    """Multi-turn chat wrapper that remembers the conversation.

    ``messages`` holds the full history in chat-completions format: the
    system prompt first, then alternating user and assistant turns.
    """

    def __init__(self, system_prompt, api_client=None):
        """Start a conversation that contains only the system prompt.

        Args:
            system_prompt: Instructions sent as the ``system`` message.
            api_client: Optional client object exposing
                ``chat.completions.create``. When omitted, the module-level
                ``client`` is used.
        """
        self.messages = [{"role": "system", "content": system_prompt}]
        self._api_client = api_client

    def chat(self, user_message):
        """Send ``user_message`` with the history and return the reply text.

        The history is updated only after the request succeeds. If the API
        call raises, the exception propagates and ``messages`` is left
        exactly as it was, so a retry does not send the same user turn twice.

        Args:
            user_message: The user's next message.

        Returns:
            The assistant's reply text from the first choice of the response.
        """
        api = client if self._api_client is None else self._api_client
        user_turn = {"role": "user", "content": user_message}
        # Send a copy of the history plus the new turn; commit to
        # self.messages only once a response has actually come back.
        response = api.chat.completions.create(
            model="gpt-4o-mini",
            messages=self.messages + [user_turn],
            max_tokens=500,
            temperature=0.7,
        )
        assistant_response = response.choices[0].message.content
        self.messages.append(user_turn)
        self.messages.append({"role": "assistant", "content": assistant_response})
        return assistant_response

    def reset(self):
        """Forget every turn, keeping only the system prompt."""
        self.messages = self.messages[:1]  # Keep system prompt only
bot = NLPChatbot(
    system_prompt=(
        "You are an expert NLP tutor. "
        "Explain concepts clearly with examples. "
        "Keep responses concise."
    )
)
# Multi-turn conversation: each question builds on the previous answers,
# which the bot keeps in its message history.
for turn in (
    "What's the difference between stemming and lemmatization?",
    "Which one should I use in a search engine?",
    "Give me a quick Python example.",
):
    print(bot.chat(turn))

Which Approach to Choose?

| Approach | Complexity | Accuracy | Cost | Use when |
|---|---|---|---|---|
| TF-IDF FAQ | Low | Medium | Free | Small FAQ, fast setup |
| Zero-shot intent | Low | Good | Free (local) | Intent routing without training |
| Semantic search | Medium | High | Free (local) | Rich knowledge base, semantic matching |
| LLM (OpenAI) | Low | Best | API cost | General conversation, complex questions |

For production use, combine semantic retrieval (RAG) with an LLM: the retriever fetches relevant context, and the LLM generates a grounded, accurate answer from it.