spaCy
spaCy is Python’s most widely used NLP library for production systems. It combines speed (written in Cython), accuracy (neural models), and a clean API into a single package. A single nlp(text) call tokenizes, tags, parses, and recognizes entities in one pass through the pipeline.
Installation
pip install spacy
# Download an English model
python -m spacy download en_core_web_sm   # Small, fast (12MB)
python -m spacy download en_core_web_md   # Medium + word vectors (43MB)
python -m spacy download en_core_web_lg   # Large + word vectors (587MB)
python -m spacy download en_core_web_trf  # Transformer-based, highest accuracy
Core Pipeline
import spacy

# Load the small English pipeline and run it once over the sample sentence.
nlp = spacy.load("en_core_web_sm")
text = "In 2025, Anthropic released Claude 3.5 Sonnet with improved reasoning and coding abilities."
doc = nlp(text)

# Tokens
token_texts = [tok.text for tok in doc]
print("Tokens:", token_texts)

# Named entities: surface text, label, and spaCy's description of the label
print("\nNamed Entities:")
for entity in doc.ents:
    label = entity.label_
    print(f" {entity.text:<25} {label:<12} {spacy.explain(label)}")

# Noun chunks: each base noun phrase together with its root token
print("\nNoun Chunks:")
for noun_chunk in doc.noun_chunks:
    root_text = noun_chunk.root.text
    print(f" {noun_chunk.text:<30} root: {root_text}")
# Dependencies
print("\nDependencies:")
for token in doc:
    print(f" {token.text:<15} {token.dep_:<10} → {token.head.text}")
Token Attributes
# Parse one example sentence whose token attributes are inspected below.
nlp = spacy.load("en_core_web_sm")
sentence = "The transformer architecture efficiently processes sequential text."
doc = nlp(sentence)
for token in doc:
    print(f"{token.text:<15} pos: {token.pos_:<8} tag: {token.tag_:<8} "
          f"lemma: {token.lemma_:<15} stop: {token.is_stop}")
Key token attributes:
token.text — original text
token.lemma_ — base form
token.pos_ — universal POS tag
token.tag_ — Penn Treebank tag
token.dep_ — dependency relation
token.head — syntactic head
token.is_stop — is it a stopword?
token.is_punct — is it punctuation?
token.ent_type_ — entity type if part of entity
Named Entity Recognition
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")

# The line break after "that" is part of the string: without it the text reads
# "thatAlphaFold", which fuses two words into one token and hides the
# "AlphaFold 3" mention from the entity recognizer.
text = """Google DeepMind published research in Nature in January 2025 showing that
AlphaFold 3 can predict the structure of DNA, RNA, and proteins with high accuracy."""
doc = nlp(text)

# One row per entity: surface text, label, and spaCy's description of the label.
for ent in doc.ents:
    print(f"{ent.text:<30} [{ent.label_}] {spacy.explain(ent.label_)}")
# Visualize (Jupyter)
displacy.render(doc, style="ent", jupyter=True)
Custom Pipeline Components
spaCy’s pipeline is composable. Add custom steps:
import spacy
from spacy.language import Language
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")

# Lower-cased single-token terms that should be labelled TECH.
TECH_KEYWORDS = frozenset({"llm", "gpt", "bert", "transformer", "langchain", "rag"})


@Language.component("tech_entity_detector")
def detect_tech_entities(doc):
    """Label every token whose lower-cased text is a tech keyword as a TECH entity.

    Entities are registered by building ``Span`` objects and assigning
    ``doc.ents``. Writing ``token.ent_type_`` on its own does not mark the
    token as the start of an entity, so such tokens would not reliably show
    up in ``doc.ents``.

    Args:
        doc: The ``Doc`` produced by the preceding pipeline components.

    Returns:
        The same ``doc``, with one single-token TECH entity per keyword match.
        Existing entities that overlap a match are dropped; all others are kept.
    """
    tech_spans = [
        Span(doc, token.i, token.i + 1, label="TECH")
        for token in doc
        if token.lower_ in TECH_KEYWORDS
    ]
    if not tech_spans:
        return doc

    # doc.ents must not contain overlapping spans, so drop any existing entity
    # that covers a token we are about to label as TECH.
    claimed = {span.start for span in tech_spans}
    kept = [
        ent for ent in doc.ents
        if claimed.isdisjoint(range(ent.start, ent.end))
    ]
    doc.ents = kept + tech_spans
    return doc
nlp.add_pipe("tech_entity_detector", after="ner")
doc = nlp("We used RAG with a BERT encoder and GPT for generation.")
# Now TECH entities are detected
Processing Multiple Documents Efficiently
import spacy

# Load the pipeline without the components this example does not use.
SKIPPED_COMPONENTS = ["parser"]
nlp = spacy.load("en_core_web_sm", disable=SKIPPED_COMPONENTS)

# Sample documents for the batch run.
texts = [
    "Machine learning models transform NLP workflows.",
    "OpenAI's API enables rapid prototyping of AI applications.",
    "spaCy's pipeline processes text at thousands of tokens per second.",
]
# Batch processing — much faster than calling nlp() one at a time
for doc in nlp.pipe(texts, batch_size=50):
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    print(f"Entities: {entities}")
Training a Custom NER Model
import random

import spacy
from spacy.training import Example

# Start from an empty English pipeline.
nlp = spacy.blank("en")

# Add NER component and register the one label it should learn.
ner = nlp.add_pipe("ner")
ner.add_label("FRAMEWORK")
# Training data: (text, {"entities": [(start, end, label)]})
# Offsets are 0-based character positions with an exclusive end, so
# text[start:end] must be exactly the entity string. Offsets that do not line
# up with the text cannot be matched to tokens and the annotation is lost.
TRAIN_DATA = [
    (
        "We built the pipeline with LangChain and LlamaIndex.",
        {"entities": [(27, 36, "FRAMEWORK"), (41, 51, "FRAMEWORK")]},
    ),
    (
        "FastAPI powers our backend API service.",
        {"entities": [(0, 7, "FRAMEWORK")]},
    ),
    (
        "The React frontend connects to the Flask server.",
        {"entities": [(4, 9, "FRAMEWORK"), (35, 40, "FRAMEWORK")]},
    ),
]
# initialize() is the spaCy v3 way to set up the model weights and obtain an
# optimizer; begin_training() is its deprecated predecessor.
optimizer = nlp.initialize()

for epoch in range(20):
    # Present the examples in a different order each epoch.
    random.shuffle(TRAIN_DATA)
    losses = {}
    for text, annotations in TRAIN_DATA:
        # Pair an unannotated Doc with its gold annotations, then take one
        # update step on this single example.
        example = Example.from_dict(nlp.make_doc(text), annotations)
        nlp.update([example], sgd=optimizer, losses=losses)
    # Report the losses accumulated over the epoch every fifth epoch.
    if epoch % 5 == 0:
        print(f"Epoch {epoch}: {losses}")
# Test
doc = nlp("We use Haystack and Chroma in our RAG pipeline.")
print([(ent.text, ent.label_) for ent in doc.ents])
spaCy with Transformer Models
import spacy

# Uses HuggingFace transformers under the hood
# pip install spacy-transformers
TRANSFORMER_PIPELINE = "en_core_web_trf"
nlp = spacy.load(TRANSFORMER_PIPELINE)
doc = nlp("Researchers at MIT published a breakthrough paper on quantum NLP.")
print("NER (transformer-based):")
for ent in doc.ents:
    print(f" {ent.text} → {ent.label_}")
Pipeline Component Reference
| Component | What it adds | Attributes set |
|---|---|---|
| tokenizer | Splits text into tokens | token.text |
| tagger | POS tags | token.pos_, token.tag_ |
| parser | Dependency parse | token.dep_, token.head |
| ner | Named entities | token.ent_type_, doc.ents |
| lemmatizer | Base forms | token.lemma_ |
| senter | Sentence boundaries | doc.sents |
| tok2vec | Shared encoder | (internal) |
Disable unused components to speed up processing:
# Load the pipeline without the dependency parser and the sentence segmenter.
# Per the component table, these are the components that provide
# token.dep_ / token.head and doc.sents, so only disable them when the
# application does not read those attributes.
nlp = spacy.load("en_core_web_sm", disable=["parser", "senter"])