Technology  /  NLP

💬 Natural Language Processing 40 guides · updated 2026

From tokenisation and embeddings to transformer-based language understanding — the NLP fundamentals that underpin every modern LLM.

OpenAI API for NLP

The OpenAI API provides access to GPT-4, GPT-4o, and other large language models for text generation, embeddings, classification, structured extraction, and more. It’s the fastest way to add high-quality language understanding to any Python application.


Installation and Setup

Terminal window
pip install openai
import os

from openai import OpenAI

# Read the key from the environment rather than hard-coding a secret in source.
# Calling OpenAI() with no arguments also picks up OPENAI_API_KEY.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

Text Generation with Chat Completions

from openai import OpenAI
client = OpenAI()
# Conversation sent to the model: a system instruction plus one user question.
conversation = [
    {"role": "system", "content": "You are an expert NLP engineer."},
    {
        "role": "user",
        "content": "Explain the difference between BERT and GPT in 3 bullet points.",
    },
]

response = client.chat.completions.create(
    model="gpt-4o",
    messages=conversation,
    temperature=0.7,
    max_tokens=300,
)

# Print the first choice's reply, then the token count reported for the call.
answer = response.choices[0].message.content
print(answer)
tokens_used = response.usage.total_tokens
print(f"\nUsage: {tokens_used} tokens")

Streaming Responses

from openai import OpenAI
client = OpenAI()
# Ask for a streamed response: with stream=True the call returns an iterator
# of chunks, each exposing choices[0].delta with the newly generated text.
stream = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "user", "content": "List 5 practical uses of NLP in healthcare."}
    ],
    stream=True,
)
for chunk in stream:
    # Guard against chunks with an empty choices list before indexing.
    if not chunk.choices:
        continue
    delta = chunk.choices[0].delta.content
    if delta:
        # Print each fragment as it arrives, without adding newlines.
        print(delta, end="", flush=True)
print()

from openai import OpenAI
import numpy as np
client = OpenAI()
def get_embedding(text, model="text-embedding-3-small"):
    """Embed ``text`` with the given embedding ``model``.

    Returns the first embedding in the API response as a NumPy array.
    """
    result = client.embeddings.create(model=model, input=text)
    first_item = result.data[0]
    return np.array(first_item.embedding)
def cosine_similarity(a, b):
    """Return the cosine similarity between vectors ``a`` and ``b``.

    Args:
        a: First vector (NumPy array or any array-like of numbers).
        b: Second vector, same length as ``a``.

    Returns:
        The dot product of ``a`` and ``b`` divided by the product of their
        Euclidean norms. If either vector has zero norm the similarity is
        undefined; ``0.0`` is returned instead of NaN so the result can
        still be sorted and compared.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # Avoid 0/0 -> NaN (and a RuntimeWarning) for all-zero vectors.
        return 0.0
    return np.dot(a, b) / denominator
# Knowledge base: short passages that the semantic search ranks against a query.
documents = [
    "Fine-tuning adapts a pretrained model to a specific task with labeled data.",
    "RAG combines a retriever with a language model for knowledge-intensive QA.",
    "Prompt engineering improves LLM outputs by crafting effective input prompts.",
    "RLHF aligns language models with human preferences using reward signals.",
    "Quantization reduces model size by using lower-precision weights.",
]

# One embedding per document, in the same order as `documents`.
doc_embeddings = list(map(get_embedding, documents))
def semantic_search(query, top_k=3):
    """Rank the knowledge-base documents by similarity to ``query``.

    Embeds ``query``, scores it against every entry of ``doc_embeddings``
    with ``cosine_similarity``, and returns the first ``top_k``
    ``(score, document)`` tuples in descending order.
    """
    query_vector = get_embedding(query)
    ranked = sorted(
        (
            (cosine_similarity(query_vector, vector), passage)
            for vector, passage in zip(doc_embeddings, documents)
        ),
        reverse=True,
    )
    return ranked[:top_k]
# Run one example query and print each hit as "[score] document".
query = "How do I make LLMs follow human instructions?"
results = semantic_search(query)
for similarity, passage in results:
    print(f"[{similarity:.4f}] {passage}")

Structured Extraction with Function Calling

from openai import OpenAI
import json
client = OpenAI()
# JSON-schema properties the model must fill in for each article.
_article_properties = {
    "headline": {"type": "string", "description": "The article headline"},
    "organizations": {
        "type": "array",
        "items": {"type": "string"},
        "description": "Organizations mentioned",
    },
    "people": {
        "type": "array",
        "items": {"type": "string"},
        "description": "People mentioned",
    },
    "date": {"type": "string", "description": "Date of the event if mentioned"},
    "sentiment": {"type": "string", "enum": ["positive", "negative", "neutral"]},
}

# Parameter schema for the extraction function; "people" and "date" are optional.
_article_parameters = {
    "type": "object",
    "properties": _article_properties,
    "required": ["headline", "organizations", "sentiment"],
}

# Tool list passed to the chat completion call: a single function definition.
tools = [
    {
        "type": "function",
        "function": {
            "name": "extract_article_info",
            "description": "Extract structured information from a news article",
            "parameters": _article_parameters,
        },
    }
]
# Sample article text to extract structured fields from.
text = (
    "\n"
    "OpenAI announced a partnership with Microsoft to expand Azure's AI infrastructure in May 2025.\n"
    "CEO Sam Altman stated the deal would accelerate deployment of GPT-5 to enterprise customers.\n"
)

# Name the extraction function explicitly in tool_choice.
forced_tool = {"type": "function", "function": {"name": "extract_article_info"}}
user_prompt = f"Extract information from this text:\n\n{text}"

response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": user_prompt}],
    tools=tools,
    tool_choice=forced_tool,
)

# The function arguments arrive as a JSON string on the first tool call.
tool_call = response.choices[0].message.tool_calls[0]
result = json.loads(tool_call.function.arguments)
print(json.dumps(result, indent=2))

Multi-Turn Conversation

from openai import OpenAI
client = OpenAI()
# Running conversation history, seeded with the system prompt. `chat` appends
# one user turn and one assistant turn to it per successful call.
messages = [
    {"role": "system", "content": "You are an NLP tutor helping students understand concepts clearly."}
]


def chat(user_message):
    """Send ``user_message`` in the context of the history and return the reply.

    Args:
        user_message: The text of the next user turn.

    Returns:
        The assistant's reply content from the first choice.

    The shared ``messages`` history is only updated after the API call
    succeeds. If the call raises, the history is left unchanged, so the same
    message can be retried without a duplicated or unanswered user turn.
    """
    user_turn = {"role": "user", "content": user_message}
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        # Send history plus the new turn without mutating the history yet.
        messages=[*messages, user_turn],
        max_tokens=300,
    )
    assistant_message = response.choices[0].message.content
    # Commit both turns together now that the request has succeeded.
    messages.append(user_turn)
    messages.append({"role": "assistant", "content": assistant_message})
    return assistant_message
# Ask three follow-up questions in order; each call extends the shared history.
questions = (
    "What is tokenization in NLP?",
    "How does it differ between BERT and GPT?",
    "Can you give me a Python example using the Hugging Face tokenizer?",
)
for question in questions:
    print(chat(question))

Text Classification with Few-Shot Prompting

from openai import OpenAI
client = OpenAI()
def classify_text(text, categories):
    """Classify ``text`` into one of ``categories`` with few-shot prompting.

    Sends three fixed labelled examples followed by ``text`` to
    ``gpt-4o-mini`` at temperature 0, with a system message listing
    ``categories``, and returns the reply with surrounding whitespace removed.
    """
    few_shot = (
        "\n"
        "Examples:\n"
        'Text: "The GPU memory was exhausted during training" → Category: technical\n'
        'Text: "The pasta dish needed more salt and garlic" → Category: food\n'
        'Text: "Revenue declined 12% due to higher interest rates" → Category: finance\n'
    )
    # The prompt ends right after "Category:" so the model supplies the label.
    user_prompt = f'{few_shot}\nText: "{text}" → Category:'

    category_list = ", ".join(categories)
    system_prompt = (
        f"Classify the text into one of these categories: {category_list}. "
        "Reply with only the category name."
    )

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=10,
        temperature=0,
    )
    reply = response.choices[0].message.content
    return reply.strip()
# Labels the classifier may choose from.
categories = ["technical", "food", "finance", "sports"]

# Sample sentences to classify.
texts = [
    "The model's F1 score improved after adding more training data.",
    "The soufflé collapsed when I opened the oven door.",
    "The Federal Reserve held interest rates steady in June 2025.",
]

# Print each sentence prefixed with its predicted label.
for sample in texts:
    predicted = classify_text(sample, categories)
    print(f"[{predicted}] {sample}")

API Model Options (2025)

| Model | Context | Speed | Best For |
| --- | --- | --- | --- |
| gpt-4o | 128K | Medium | Best quality, complex tasks |
| gpt-4o-mini | 128K | Fast | Cost-efficient, most tasks |
| gpt-4-turbo | 128K | Medium | High accuracy, JSON mode |
| o1-preview | 128K | Slow | Complex reasoning |
| text-embedding-3-small | — | Fast | Embeddings, low cost |
| text-embedding-3-large | — | Medium | Embeddings, highest quality |

For most NLP tasks in 2025, gpt-4o-mini provides the best cost/performance tradeoff.