
NLP with spaCy: Complete Natural Language Processing Guide

Master natural language processing with spaCy in Python. Learn tokenization, NER, POS tagging, and text classification, and build NLP pipelines for production.

Moshiour Rahman

What is spaCy?

spaCy is an industrial-strength NLP library designed for production use. It provides fast, accurate processing for tasks like tokenization, named entity recognition, and text classification.

spaCy vs NLTK

Feature    spaCy            NLTK
Speed      Fast             Slower
Models     Pre-trained      Build your own
Use Case   Production       Research/Learning
API        Object-oriented  Functional
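
The API row is the practical difference: spaCy returns one rich Doc object per call, while NLTK chains standalone functions. A minimal side-by-side sketch (assumes both libraries are installed; exact NLTK data package names vary by version):

import spacy
import nltk

# spaCy: object-oriented, one call yields a Doc with annotations attached
nlp = spacy.load("en_core_web_sm")
doc = nlp("spaCy parses text in one pass.")
print([(t.text, t.pos_) for t in doc])

# NLTK: functional, each step is a separate function call on plain lists
nltk.download("punkt", quiet=True)                        # tokenizer data
nltk.download("averaged_perceptron_tagger", quiet=True)   # POS tagger data
tokens = nltk.word_tokenize("NLTK chains separate function calls.")
print(nltk.pos_tag(tokens))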

Getting Started

Installation

pip install spacy

# Download language models
python -m spacy download en_core_web_sm  # Small
python -m spacy download en_core_web_md  # Medium (with word vectors)
python -m spacy download en_core_web_lg  # Large
python -m spacy download en_core_web_trf # Transformer-based
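
If a script may run where the model isn't installed, a small fallback loader avoids a hard crash (a sketch, assuming network access is available):

import spacy

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Model not installed yet: download it, then load
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")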

Basic Processing

import spacy

# Load model
nlp = spacy.load("en_core_web_sm")

# Process text
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Iterate over tokens
for token in doc:
    print(f"{token.text:12} {token.pos_:6} {token.dep_:10} {token.head.text}")

Output:

Apple        PROPN  nsubj      looking
is           AUX    aux        looking
looking      VERB   ROOT       looking
at           ADP    prep       looking
buying       VERB   pcomp      at
U.K.         PROPN  compound   startup
startup      NOUN   dobj       buying
for          ADP    prep       buying
$            SYM    quantmod   billion
1            NUM    compound   billion
billion      NUM    pobj       for
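
To inspect a parse visually, spaCy bundles the displaCy visualizer: render() displays inline in Jupyter notebooks, while serve() starts a local web server for plain scripts.

from spacy import displacy

# Inline rendering in a notebook; in a script, use
# displacy.serve(doc, style="dep") instead
displacy.render(doc, style="dep")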

Tokenization

Token Attributes

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("I don't like spiders, but I love Python!")

for token in doc:
    print(f"""
Token: {token.text}
Lemma: {token.lemma_}
POS: {token.pos_}
Tag: {token.tag_}
Dependency: {token.dep_}
Shape: {token.shape_}
Is Alpha: {token.is_alpha}
Is Stop: {token.is_stop}
""")

Custom Tokenization

import spacy
from spacy.util import compile_infix_regex

nlp = spacy.load("en_core_web_sm")

# Custom infixes: also split tokens on "@" between letters
infixes = list(nlp.Defaults.infixes)
infixes.append(r'(?<=[a-zA-Z])@(?=[a-zA-Z])')

infix_regex = compile_infix_regex(infixes)
nlp.tokenizer.infix_finditer = infix_regex.finditer

# Add special cases (the ORTH values must concatenate back to the original text)
nlp.tokenizer.add_special_case("don't", [
    {"ORTH": "do"},
    {"ORTH": "n't"}
])
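
A quick check of both customizations (the exact split of the address may vary slightly with the default suffix rules):

doc = nlp("Email alice@example.com, don't wait!")
print([token.text for token in doc])
# Expected (approximately):
# ['Email', 'alice', '@', 'example.com', ',', 'do', "n't", 'wait', '!']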

Named Entity Recognition (NER)

Extract Entities

import spacy

nlp = spacy.load("en_core_web_sm")
text = """Apple Inc. was founded by Steve Jobs in Cupertino, California.
The company is worth over $2 trillion as of 2024."""

doc = nlp(text)

# Extract entities
for ent in doc.ents:
    print(f"{ent.text:20} {ent.label_:10} {spacy.explain(ent.label_)}")

Output:

Apple Inc.           ORG        Companies, agencies, institutions
Steve Jobs           PERSON     People, including fictional
Cupertino            GPE        Countries, cities, states
California           GPE        Countries, cities, states
over $2 trillion     MONEY      Monetary values
2024                 DATE       Absolute or relative dates
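
Since doc.ents is just a sequence of spans, grouping mentions downstream is straightforward:

from collections import defaultdict

# Group entity mentions by label
by_label = defaultdict(list)
for ent in doc.ents:
    by_label[ent.label_].append(ent.text)
print(dict(by_label))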

Custom NER Training

import spacy
from spacy.training import Example
import random

# Create blank model
nlp = spacy.blank("en")

# Add NER pipeline
ner = nlp.add_pipe("ner")

# Add labels (every label used in the training data must be registered)
ner.add_label("PRODUCT")
ner.add_label("TECH")
ner.add_label("ORG")

# Training data
TRAIN_DATA = [
    ("iPhone 15 is Apple's latest smartphone", {"entities": [(0, 9, "PRODUCT"), (13, 18, "ORG")]}),
    ("TensorFlow is a machine learning framework", {"entities": [(0, 10, "TECH")]}),
    ("Microsoft released Windows 11", {"entities": [(0, 9, "ORG"), (19, 29, "PRODUCT")]}),
]

# Train (nlp.initialize() replaces the deprecated begin_training() in spaCy v3)
optimizer = nlp.initialize()

for epoch in range(30):
    random.shuffle(TRAIN_DATA)
    losses = {}

    for text, annotations in TRAIN_DATA:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example], drop=0.5, sgd=optimizer, losses=losses)

    print(f"Epoch {epoch}: {losses}")

# Test (with only three training examples, expect rough predictions)
doc = nlp("The new MacBook Pro features M3 chip")
for ent in doc.ents:
    print(f"{ent.text}: {ent.label_}")

Text Classification

Train Text Classifier

import spacy
from spacy.training import Example

# Create model with textcat
nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")

# Add labels
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")

# Training data
TRAIN_DATA = [
    ("This movie is amazing!", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
    ("I hated every minute of it", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
    ("Best experience ever", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
    ("Terrible waste of time", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
]

# Train
optimizer = nlp.initialize()

for epoch in range(20):
    losses = {}
    for text, annotations in TRAIN_DATA:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example], sgd=optimizer, losses=losses)
    print(f"Epoch {epoch}: {losses}")

# Predict
doc = nlp("This product exceeded my expectations")
print(doc.cats)
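
doc.cats maps each label to a score between 0 and 1; for single-label classification, take the highest-scoring label:

# Pick the top label and its score
label = max(doc.cats, key=doc.cats.get)
print(f"{label}: {doc.cats[label]:.3f}")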

Multi-label Classification

import spacy
from spacy.training import Example

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat_multilabel")

# Add multiple labels
labels = ["TECH", "BUSINESS", "SPORTS", "ENTERTAINMENT"]
for label in labels:
    textcat.add_label(label)

TRAIN_DATA = [
    ("Apple releases new iPhone", {"cats": {"TECH": 1.0, "BUSINESS": 1.0, "SPORTS": 0.0, "ENTERTAINMENT": 0.0}}),
    ("Lakers win championship", {"cats": {"TECH": 0.0, "BUSINESS": 0.0, "SPORTS": 1.0, "ENTERTAINMENT": 1.0}}),
]

# Training mirrors the single-label loop above: nlp.initialize(), then nlp.update()
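
After training, scores are independent per label, so apply a per-label threshold rather than a single argmax (0.5 here is an arbitrary cutoff):

doc = nlp("Apple stock rises after iPhone launch")

# Every label whose score clears the threshold is predicted
predicted = [label for label, score in doc.cats.items() if score >= 0.5]
print(predicted)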

Dependency Parsing

Analyze Syntax

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The quick brown fox jumps over the lazy dog")

# Dependency tree
for token in doc:
    print(f"{token.text:10} -> {token.head.text:10} ({token.dep_})")

# Get noun chunks
for chunk in doc.noun_chunks:
    print(f"Chunk: {chunk.text}, Root: {chunk.root.text}, Head: {chunk.root.head.text}")

Extract Relationships

import spacy

nlp = spacy.load("en_core_web_sm")

def extract_subject_verb_object(doc):
    """Extract SVO triplets from text."""
    triplets = []

    for token in doc:
        if token.dep_ == "ROOT":
            verb = token
            subject = None
            obj = None

            for child in token.children:
                if child.dep_ in ["nsubj", "nsubjpass"]:
                    subject = " ".join([t.text for t in child.subtree])
                elif child.dep_ in ["dobj", "pobj", "attr"]:
                    obj = " ".join([t.text for t in child.subtree])

            if subject and obj:
                triplets.append((subject, verb.text, obj))

    return triplets

text = "John bought a new car. Mary loves chocolate."
doc = nlp(text)

for subj, verb, obj in extract_subject_verb_object(doc):
    print(f"{subj} -> {verb} -> {obj}")

Word Vectors and Similarity

Semantic Similarity

import spacy

# Need medium or large model for vectors
nlp = spacy.load("en_core_web_md")

# Document similarity
doc1 = nlp("I like cats")
doc2 = nlp("I love dogs")
doc3 = nlp("The weather is nice")

print(f"Cats vs Dogs: {doc1.similarity(doc2):.3f}")
print(f"Cats vs Weather: {doc1.similarity(doc3):.3f}")

# Word similarity
word1 = nlp("king")
word2 = nlp("queen")
word3 = nlp("car")

print(f"King vs Queen: {word1.similarity(word2):.3f}")
print(f"King vs Car: {word1.similarity(word3):.3f}")

# Find similar words
from scipy.spatial.distance import cosine

def find_similar_words(word: str, nlp, top_n: int = 10):
    """Rank cached vocabulary entries by cosine similarity to `word`."""
    word_vec = nlp(word).vector

    similarities = []
    # Iterating nlp.vocab only covers lexemes already in the vocab cache;
    # for exhaustive coverage, iterate the keys of nlp.vocab.vectors instead.
    for vocab_word in nlp.vocab:
        if vocab_word.has_vector and vocab_word.is_alpha:
            sim = 1 - cosine(word_vec, vocab_word.vector)
            similarities.append((vocab_word.text, sim))

    return sorted(similarities, key=lambda x: x[1], reverse=True)[:top_n]
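
A usage sketch (requires a model with vectors such as en_core_web_md; results depend on which lexemes are currently cached):

for text, score in find_similar_words("dog", nlp, top_n=5):
    print(f"{text}: {score:.3f}")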

Custom Pipelines

Add Custom Component

import spacy
from spacy.language import Language
from spacy.tokens import Span
from spacy.util import filter_spans

# Register custom component
@Language.component("custom_sentencizer")
def custom_sentencizer(doc):
    for token in doc[:-1]:
        if token.text in [".", "!", "?"]:
            doc[token.i + 1].is_sent_start = True
        else:
            doc[token.i + 1].is_sent_start = False
    return doc

# Add to pipeline
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("custom_sentencizer", before="parser")

# Custom entity component
@Language.component("tech_entity_ruler")
def tech_entity_ruler(doc):
    tech_terms = ["Python", "JavaScript", "React", "TensorFlow"]

    new_ents = list(doc.ents)
    for token in doc:
        if token.text in tech_terms:
            span = Span(doc, token.i, token.i + 1, label="TECH")
            new_ents.append(span)

    # doc.ents rejects overlapping spans; filter_spans keeps the longest ones
    doc.ents = filter_spans(new_ents)
    return doc

nlp.add_pipe("tech_entity_ruler", after="ner")

Entity Ruler

import spacy

nlp = spacy.load("en_core_web_sm")

# Add entity ruler
ruler = nlp.add_pipe("entity_ruler", before="ner")

# Define patterns
patterns = [
    {"label": "TECH", "pattern": "Python"},
    {"label": "TECH", "pattern": "JavaScript"},
    {"label": "TECH", "pattern": [{"LOWER": "machine"}, {"LOWER": "learning"}]},
    {"label": "FRAMEWORK", "pattern": [{"LOWER": "react"}, {"LOWER": "native"}]},
]

ruler.add_patterns(patterns)

# Test
doc = nlp("I'm learning Python and machine learning with React Native")
for ent in doc.ents:
    print(f"{ent.text}: {ent.label_}")

Text Preprocessing

Cleaning Pipeline

import spacy
import re

nlp = spacy.load("en_core_web_sm")

def preprocess_text(text: str) -> str:
    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # Remove special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Process with spaCy
    doc = nlp(text.lower())

    # Lemmatize and remove stopwords
    tokens = [
        token.lemma_ for token in doc
        if not token.is_stop
        and not token.is_punct
        and token.is_alpha
        and len(token) > 2
    ]

    return " ".join(tokens)

# Usage
text = "I'm loving this new product! Check out https://example.com for more info."
cleaned = preprocess_text(text)
print(cleaned)  # "love new product check info"

Batch Processing

import spacy
from typing import List, Generator

nlp = spacy.load("en_core_web_sm")

def process_texts_batch(texts: List[str], batch_size: int = 100) -> Generator:
    """Process texts in batches for efficiency."""
    # n_process spawns worker processes; the overhead only pays off on large workloads
    for doc in nlp.pipe(texts, batch_size=batch_size, n_process=4):
        yield {
            "text": doc.text,
            "entities": [(ent.text, ent.label_) for ent in doc.ents],
            "noun_chunks": [chunk.text for chunk in doc.noun_chunks],
            "sentences": [sent.text for sent in doc.sents]
        }

# Usage
texts = [
    "Apple announced new products.",
    "Microsoft acquired a startup.",
    "Google released an update."
]

for result in process_texts_batch(texts):
    print(result)
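
When only some annotations are needed, disabling unused components speeds up nlp.pipe considerably. Note that noun_chunks and sents require the parser, so this variant suits entity-only workloads:

# Entity-only pass: skip components this workload doesn't need
for doc in nlp.pipe(texts, disable=["parser", "lemmatizer"]):
    print([(ent.text, ent.label_) for ent in doc.ents])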

FastAPI NLP Service

from fastapi import FastAPI
from pydantic import BaseModel
from typing import List
import spacy

app = FastAPI()
nlp = spacy.load("en_core_web_sm")

class TextInput(BaseModel):
    text: str

class Entity(BaseModel):
    text: str
    label: str
    start: int
    end: int

class NLPResponse(BaseModel):
    entities: List[Entity]
    noun_chunks: List[str]
    sentences: List[str]
    tokens: List[dict]

@app.post("/analyze", response_model=NLPResponse)
async def analyze_text(input: TextInput):
    doc = nlp(input.text)

    return NLPResponse(
        entities=[
            Entity(text=ent.text, label=ent.label_, start=ent.start_char, end=ent.end_char)
            for ent in doc.ents
        ],
        noun_chunks=[chunk.text for chunk in doc.noun_chunks],
        sentences=[sent.text for sent in doc.sents],
        tokens=[
            {"text": t.text, "pos": t.pos_, "lemma": t.lemma_}
            for t in doc
        ]
    )

@app.post("/similarity")
async def calculate_similarity(text1: str, text2: str):
    doc1 = nlp(text1)
    doc2 = nlp(text2)

    return {"similarity": doc1.similarity(doc2)}

Summary

Task              Method
Tokenization      nlp(text)
NER               doc.ents
POS Tagging       token.pos_
Dependency Parse  token.dep_
Similarity        doc.similarity()
Classification    textcat pipeline

spaCy provides production-ready NLP capabilities with excellent performance and easy-to-use APIs.
