spaCy

spaCy is Python’s most widely used NLP library for production systems. It combines speed (written in Cython), accuracy (neural models), and a clean API into a single package. A single nlp(text) call runs the whole pipeline — tokenization, tagging, parsing, and entity recognition — and returns one Doc carrying all of the annotations.

Installation

# Install the spaCy library itself
pip install spacy

# Download an English model
python -m spacy download en_core_web_sm    # Small, fast (12MB)
python -m spacy download en_core_web_md    # Medium + word vectors (43MB)
python -m spacy download en_core_web_lg    # Large + word vectors (587MB)
python -m spacy download en_core_web_trf   # Transformer-based, highest accuracy

Core Pipeline

import spacy

nlp = spacy.load("en_core_web_sm")
text = "In 2025, Anthropic released Claude 3.5 Sonnet with improved reasoning and coding abilities."

# A single call runs the loaded pipeline; everything printed below is read
# off the resulting Doc.
doc = nlp(text)

# Surface text of each token
token_texts = [tok.text for tok in doc]
print("Tokens:", token_texts)

# Entity spans: text, label, and spaCy's description of that label
print("\nNamed Entities:")
for span in doc.ents:
    label = span.label_
    print(f"  {span.text:<25} {label:<12} {spacy.explain(label)}")

# Noun chunks, each shown with the text of its root token
print("\nNoun Chunks:")
for noun_phrase in doc.noun_chunks:
    root_text = noun_phrase.root.text
    print(f"  {noun_phrase.text:<30} root: {root_text}")

# Dependency arcs: token, relation label, head token
print("\nDependencies:")
for tok in doc:
    head_text = tok.head.text
    print(f"  {tok.text:<15} {tok.dep_:<10} → {head_text}")

Token Attributes

nlp = spacy.load("en_core_web_sm")
doc = nlp("The transformer architecture efficiently processes sequential text.")

# One row per token; each column is padded to a fixed width so rows line up.
for tok in doc:
    columns = [
        f"{tok.text:<15}",
        f"pos: {tok.pos_:<8}",
        f"tag: {tok.tag_:<8}",
        f"lemma: {tok.lemma_:<15}",
        f"stop: {tok.is_stop}",
    ]
    print(" ".join(columns))

Key token attributes:

token.text — original text
token.lemma_ — base form
token.pos_ — universal POS tag
token.tag_ — fine-grained POS tag (Penn Treebank tags in the English models)
token.dep_ — dependency relation
token.head — syntactic head
token.is_stop — is it a stopword?
token.is_punct — is it punctuation?
token.ent_type_ — entity type if part of entity

Named Entity Recognition

import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")

text = """
Google DeepMind published research in Nature in January 2025 showing that
AlphaFold 3 can predict the structure of DNA, RNA, and proteins with high accuracy.
"""
doc = nlp(text)

# One row per entity: padded text, label in brackets, label description
for entity in doc.ents:
    tag = entity.label_
    description = spacy.explain(tag)
    print(f"{entity.text:<30} [{tag}] {description}")

# Render the entity visualization (for use inside a Jupyter notebook)
displacy.render(doc, style="ent", jupyter=True)

Custom Pipeline Components

spaCy’s pipeline is composable. Add custom steps:

import spacy
from spacy.language import Language
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")


@Language.component("tech_entity_detector")
def detect_tech_entities(doc):
    """Mark single tokens that match a known tech keyword as TECH entities.

    Matching is case-insensitive on the token text.

    Args:
        doc: The Doc being passed along the pipeline.

    Returns:
        The same Doc, with a one-token TECH span recorded for every match.
    """
    tech_keywords = {"llm", "gpt", "bert", "transformer", "langchain", "rag"}
    tech_spans = [
        Span(doc, token.i, token.i + 1, label="TECH")
        for token in doc
        if token.text.lower() in tech_keywords
    ]
    # Assigning token.ent_type_ on its own leaves the token's IOB flag
    # untouched, so the token never shows up in doc.ents. set_ents writes both
    # the type and the IOB flag; default="unmodified" keeps the annotations of
    # every token not covered by a TECH span (i.e. what "ner" already found).
    doc.set_ents(tech_spans, default="unmodified")
    return doc


# Run after the statistical NER so the keyword matches are applied on top of it
nlp.add_pipe("tech_entity_detector", after="ner")
doc = nlp("We used RAG with a BERT encoder and GPT for generation.")
# doc.ents now contains TECH spans for the matched keywords

Processing Multiple Documents Efficiently

import spacy

# Only entities are read below, so the parser is switched off at load time
nlp = spacy.load("en_core_web_sm", disable=["parser"])

texts = [
    "Machine learning models transform NLP workflows.",
    "OpenAI's API enables rapid prototyping of AI applications.",
    "spaCy's pipeline processes text at thousands of tokens per second.",
]

# Stream the texts through nlp.pipe in batches; this is much faster than
# one nlp() call per text
doc_stream = nlp.pipe(texts, batch_size=50)
for doc in doc_stream:
    entities = [(span.text, span.label_) for span in doc.ents]
    print(f"Entities: {entities}")

Training a Custom NER Model

import spacy
from spacy.training import Example
import random

# Blank English pipeline to train from scratch
nlp = spacy.blank("en")

# Attach an entity recognizer and register the one label it should predict
ner = nlp.add_pipe("ner")
ner.add_label("FRAMEWORK")

# Training data: (text, {"entities": [(start, end, label)]})
# start/end are character offsets into text (end is exclusive). Each pair must
# slice out exactly the entity string, e.g. text[27:36] == "LangChain";
# spans that do not line up with token boundaries are not usable for training.
TRAIN_DATA = [
    ("We built the pipeline with LangChain and LlamaIndex.", {"entities": [(27, 36, "FRAMEWORK"), (41, 51, "FRAMEWORK")]}),
    ("FastAPI powers our backend API service.", {"entities": [(0, 7, "FRAMEWORK")]}),
    ("The React frontend connects to the Flask server.", {"entities": [(4, 9, "FRAMEWORK"), (35, 40, "FRAMEWORK")]}),
]

# Build the Example objects once; they are reused unchanged in every epoch.
examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in TRAIN_DATA
]

# nlp.initialize() is the spaCy v3 replacement for the deprecated
# nlp.begin_training(); it initializes the components from the examples and
# returns the optimizer.
optimizer = nlp.initialize(lambda: examples)

for epoch in range(20):
    random.shuffle(examples)
    losses = {}  # accumulated per component over this epoch
    for example in examples:
        nlp.update([example], sgd=optimizer, losses=losses)
    if epoch % 5 == 0:
        print(f"Epoch {epoch}: {losses}")

# Test
doc = nlp("We use Haystack and Chroma in our RAG pipeline.")
print([(ent.text, ent.label_) for ent in doc.ents])

spaCy with Transformer Models

import spacy

# Transformer pipeline: uses HuggingFace transformers under the hood
# Requires: pip install spacy-transformers
nlp = spacy.load("en_core_web_trf")

doc = nlp("Researchers at MIT published a breakthrough paper on quantum NLP.")

# One line per entity: text → label
print("NER (transformer-based):")
for span in doc.ents:
    entity_text, entity_label = span.text, span.label_
    print(f"  {entity_text} → {entity_label}")

Pipeline Component Reference

Component	What it adds	`token.X`
`tokenizer`	Splits text into tokens	`token.text`
`tagger`	POS tags	`token.pos_`, `token.tag_`
`parser`	Dependency parse	`token.dep_`, `token.head`
`ner`	Named entities	`token.ent_type_`, `doc.ents`
`lemmatizer`	Base forms	`token.lemma_`
`senter`	Sentence boundaries	`doc.sents`
`tok2vec`	Shared encoder	(internal)

Disable unused components to speed up processing:

# Name the components to switch off, then hand the list to spacy.load
unused_components = ["parser", "senter"]
nlp = spacy.load("en_core_web_sm", disable=unused_components)