Extract the Meaning of a Paragraph Using NLP
Meaning extraction converts raw paragraphs into structured knowledge — key topics, entities, intent, and relationships. This guide covers a progression from lightweight rule-based extraction to LLM-powered semantic understanding.
Keyword Extraction with TF-IDF
The simplest approach: words with high TF-IDF scores relative to a background corpus are the most distinctive and meaningful:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Keep the line breaks between the lines of this text: without them the last
# word of one line fuses with the first word of the next (e.g. "withlarge"),
# and those fused tokens show up as bogus high-scoring keywords.
paragraph = """Retrieval-Augmented Generation (RAG) combines dense vector retrieval with
large language models to produce accurate, grounded answers. Instead of
relying solely on parameters stored in the model weights, RAG systems
dynamically fetch relevant documents from an external knowledge base and
provide them as context to the LLM during inference. This approach
significantly reduces hallucination and keeps the model's knowledge up to date."""

# Create a background corpus to compute IDF against
background = [
    "Language models are trained on large text corpora.",
    "Neural networks learn representations from data.",
    "Vector search enables fast retrieval from large datasets.",
    "Machine learning models make predictions from input features.",
]

# The paragraph goes first so that row 0 of the matrix is the paragraph.
all_docs = [paragraph] + background
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
tfidf_matrix = vectorizer.fit_transform(all_docs)

# Keywords from the paragraph (first document)
paragraph_tfidf = tfidf_matrix[0].toarray()[0]
feature_names = vectorizer.get_feature_names_out()
# argsort is ascending, so reverse it and keep the ten highest-scoring terms.
top_indices = np.argsort(paragraph_tfidf)[::-1][:10]
keywords = [
    # float(...) turns the NumPy scalar into a plain Python float.
    (feature_names[i], round(float(paragraph_tfidf[i]), 4))
    for i in top_indices
    if paragraph_tfidf[i] > 0  # skip terms that do not occur in the paragraph
]

print("Extracted keywords:")
for keyword, score in keywords:
    print(f" {keyword:<35} {score}")

# Named Entity and Noun Chunk Extraction
spaCy extracts the “who”, “what”, and “where” from a paragraph:
import spacy

nlp = spacy.load("en_core_web_sm")

# Each sentence fragment sits on its own line; without the line breaks the
# text fuses into "San Jose.CEO" and "models,targeting".
paragraph = """In March 2025, NVIDIA announced the Blackwell Ultra GPU architecture at GTC conference in San Jose.
CEO Jensen Huang showcased performance improvements of 40x over previous H100 models,
targeting AI training workloads for companies like Google, Microsoft, and Amazon."""

# Collapse every run of whitespace (including the line breaks above) to a
# single space so the parser sees one clean paragraph and printed phrases
# never contain embedded newlines.
doc = nlp(" ".join(paragraph.split()))

print("=== Named Entities ===")
for ent in doc.ents:
    print(f" {ent.text:<30} [{ent.label_}]")

print("\n=== Key Noun Phrases ===")
for chunk in doc.noun_chunks:
    if len(chunk.text.split()) > 1:  # multi-word phrases only
        print(f" {chunk.text:<40} root: {chunk.root.text}")

print("\n=== Main Verbs (actions) ===")
for token in doc:
    # Keep verbs that are not on spaCy's stop-word list.
    if token.pos_ == "VERB" and not token.is_stop:
        print(f" {token.text:<20} lemma: {token.lemma_}")

# Extracting the Main Claim
Find the root verb and its subject-object structure:
import spacy

nlp = spacy.load("en_core_web_sm")


def extract_main_claim(text):
    """Return the subject / verb / object of each root verb in *text*.

    Args:
        text: The text to parse with the module-level ``nlp`` pipeline.

    Returns:
        A list with one dict per token whose dependency label is ``ROOT``
        and whose part of speech is ``VERB``. Each dict has the keys:

        * ``"subject"`` - text of the noun chunk headed by the verb's first
          ``nsubj``/``nsubjpass`` left child; the bare subject token text if
          no noun chunk is headed by it; ``"unknown"`` if the verb has no
          such child.
        * ``"verb"`` - lemma of the root verb.
        * ``"object"`` - text of the verb's first right child labelled
          ``dobj``, ``pobj`` or ``attr``; ``""`` if there is none.
    """
    doc = nlp(text)

    # Index noun chunks by the position of their head token so the subject
    # token can be mapped to its full phrase exactly. Substring-matching
    # token text against chunk text is unreliable: an auxiliary such as "is"
    # is a substring of "this case" and would select the wrong phrase.
    chunk_by_head = {chunk.root.i: chunk.text for chunk in doc.noun_chunks}

    claims = []
    for token in doc:
        if token.dep_ != "ROOT" or token.pos_ != "VERB":
            continue

        subjects = [w for w in token.lefts if w.dep_ in ("nsubj", "nsubjpass")]
        objects = [w.text for w in token.rights if w.dep_ in ("dobj", "pobj", "attr")]

        if subjects:
            head = subjects[0]
            subject_full = chunk_by_head.get(head.i, head.text)
        else:
            subject_full = "unknown"

        claims.append({
            "subject": subject_full,
            "verb": token.lemma_,
            "object": objects[0] if objects else "",
        })

    return claims
# Demo: run the claim extractor on a single sentence and print each claim.
paragraph = (
    "Large language models have fundamentally changed "
    "how developers build software applications."
)
claims = extract_main_claim(paragraph)
for claim in claims:
    # One print call, three output lines: subject, action, object.
    print(
        f"Subject: {claim['subject']}",
        f"Action: {claim['verb']}",
        f"Object: {claim['object']}",
        sep="\n",
    )

# Automatic Summarization
from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Keep the line breaks between the lines of this text: without them words
# fuse across lines (e.g. "introductionof", "matching.Researchers") and the
# model is fed corrupted input.
long_paragraph = """The development of large language models has accelerated dramatically since the introduction
of the transformer architecture in 2017. These models, trained on hundreds of billions of words
from the internet, books, and code, have demonstrated remarkable capabilities in understanding
and generating human language. They can write essays, solve math problems, generate code,
translate between languages, and answer complex questions. However, they also suffer from
significant limitations, including hallucination of false information, sensitivity to prompt
phrasing, and difficulty with tasks requiring true reasoning rather than pattern matching.
Researchers continue to work on alignment techniques, better evaluation benchmarks, and
more efficient training methods to address these shortcomings."""

# do_sample=False disables sampling during generation.
summary = summarizer(long_paragraph, max_length=80, min_length=30, do_sample=False)
print("Summary:")
print(summary[0]['summary_text'])

# LLM-Powered Meaning Extraction
For the most comprehensive extraction, use an LLM to produce structured output:
import json

from openai import OpenAI

client = OpenAI()


def extract_paragraph_meaning(paragraph, model="gpt-4o-mini"):
    """Ask a chat model for a structured, JSON summary of *paragraph*.

    Args:
        paragraph: The text to analyze; it is embedded verbatim in the prompt.
        model: Name of the chat model to call. Defaults to ``"gpt-4o-mini"``.

    Returns:
        The parsed JSON object returned by the model. The prompt asks for the
        keys ``main_topic``, ``key_points``, ``entities``, ``sentiment`` and
        ``intent``; the reply is not validated against that list.

    Raises:
        ValueError: If the model returns no content, or content that is not
            valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    # Each requested field is on its own line and the paragraph is quoted on
    # a separate line so the model can tell instructions from input.
    prompt = f"""Extract the following from this paragraph and return as JSON:
1. "main_topic": The primary subject in 3-5 words
2. "key_points": List of 3-5 main points as bullet strings
3. "entities": Named entities as {{name: type}} dict
4. "sentiment": Overall tone (positive/negative/neutral)
5. "intent": What is the author trying to communicate? (1 sentence)

Paragraph:
"{paragraph}"
"""
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
        temperature=0,
    )

    content = response.choices[0].message.content
    if not content:
        # Fail with a clear message instead of letting json.loads raise an
        # opaque TypeError on None.
        raise ValueError("Model returned no content to parse as JSON")
    return json.loads(content)
# Keep the line breaks between the lines of this text: without them words
# fuse across lines (e.g. "harmless,and", "ofprinciples") before the text is
# sent to the model.
paragraph = """Anthropic's Constitutional AI approach trains language models to be helpful, harmless,
and honest by having the model critique and revise its own outputs based on a set of
principles. Released in 2022, this technique has shown promising results in reducing
harmful outputs compared to standard RLHF methods, while maintaining model capability
on standard benchmarks."""

result = extract_paragraph_meaning(paragraph)
print(json.dumps(result, indent=2))

# Complete Pipeline
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

nlp = spacy.load("en_core_web_sm")


def analyze_paragraph(text):
    """Extract entities, noun phrases, verbs and keywords from *text*.

    Args:
        text: The paragraph to analyze.

    Returns:
        A dict with four keys:

        * ``"entities"`` - mapping of entity text to entity label.
        * ``"key_phrases"`` - up to five distinct multi-word noun chunks, in
          order of first appearance.
        * ``"main_verbs"`` - up to five distinct lemmas of non-stop-word
          verbs, in order of first appearance.
        * ``"keywords"`` - up to eight unigrams/bigrams ranked by TF-IDF
          score, highest first; an empty list when the vectorizer rejects
          the text (for example, when no terms remain after stop-word
          removal).
    """
    doc = nlp(text)

    # Entities
    entities = {ent.text: ent.label_ for ent in doc.ents}

    # Noun phrases (multi-word only)
    key_phrases = [chunk.text for chunk in doc.noun_chunks if len(chunk.text.split()) > 1]

    # Main verbs
    main_verbs = [token.lemma_ for token in doc if token.pos_ == "VERB" and not token.is_stop]

    # Keywords via TF-IDF (against itself). The vectorizer is fitted on this
    # one text only, so there is no background corpus to compare against.
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
    try:
        scores = vectorizer.fit_transform([text]).toarray()[0]
    except ValueError:
        # The vectorizer could not build a vocabulary from this text.
        keywords = []
    else:
        # Feature names come back sorted alphabetically, so rank by score
        # explicitly; the stable sort keeps alphabetical order among ties.
        ranked = np.argsort(-scores, kind="stable")[:8]
        keywords = vectorizer.get_feature_names_out()[ranked].tolist()

    return {
        "entities": entities,
        # dict.fromkeys de-duplicates while keeping first-seen order; slicing
        # a set would pick a different five items from run to run.
        "key_phrases": list(dict.fromkeys(key_phrases))[:5],
        "main_verbs": list(dict.fromkeys(main_verbs))[:5],
        "keywords": keywords,
    }
# Demo: run the full pipeline on one sentence and print each result field.
para = (
    "Mistral AI released Mixtral 8x7B, a sparse mixture-of-experts model "
    "that matches GPT-3.5 quality at a fraction of the computational cost."
)
result = analyze_paragraph(para)
for key, value in result.items():
    line = f"{key}: {value}"
    print(line)