Natural Language Processing with Python: Complete Guide

Learn NLP fundamentals with Python: text preprocessing, sentiment analysis, named entity recognition, and building NLP applications with NLTK, spaCy, and Hugging Face Transformers.

Moshiour Rahman

What is Natural Language Processing?

Natural Language Processing (NLP) enables computers to understand, interpret, and generate human language. It powers chatbots, sentiment analysis, translation, and much more.

Installation

# Install the libraries used in this guide (TextBlob, Gensim, and scikit-learn appear in later sections)
pip install nltk spacy textblob gensim scikit-learn transformers torch
python -m spacy download en_core_web_sm

# Then, in Python, download the NLTK data the examples rely on
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
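
To confirm the setup, a quick check (a minimal sketch; it just loads the spaCy model and tokenizes a sentence) will fail loudly if a download was missed:

import spacy
from nltk.tokenize import word_tokenize

nlp = spacy.load('en_core_web_sm')             # OSError if the model wasn't downloaded
print(word_tokenize("Setup looks good."))      # LookupError if the 'punkt' data is missing
print([token.text for token in nlp("Setup looks good.")])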

Text Preprocessing

Tokenization

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

text = "Hello World! This is NLP. It's amazing, isn't it?"

# Sentence tokenization
sentences = sent_tokenize(text)
print(sentences)
# ['Hello World!', 'This is NLP.', "It's amazing, isn't it?"]

# Word tokenization
words = word_tokenize(text)
print(words)
# ['Hello', 'World', '!', 'This', 'is', 'NLP', '.', 'It', "'s", 'amazing', ',', 'is', "n't", 'it', '?']

Lowercasing and Punctuation Removal

import re
import string

def clean_text(text):
    # Lowercase
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

text = "Hello World!!! This is   NLP..."
print(clean_text(text))  # "hello world this is nlp"

Stop Words Removal

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = word_tokenize(text.lower())
    filtered = [w for w in words if w not in stop_words and w.isalnum()]
    return ' '.join(filtered)

text = "This is a sample sentence showing off stop word filtration"
print(remove_stopwords(text))
# "sample sentence showing stop word filtration"

Stemming and Lemmatization

from nltk.stem import PorterStemmer, WordNetLemmatizer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

words = ["running", "runs", "ran", "easily", "fairly", "studies", "studying"]

# Stemming (crude, rule-based)
stems = [stemmer.stem(w) for w in words]
print(stems)  # ['run', 'run', 'ran', 'easili', 'fairli', 'studi', 'studi']

# Lemmatization (dictionary-based, more accurate)
lemmas = [lemmatizer.lemmatize(w, pos='v') for w in words]
print(lemmas)  # ['run', 'run', 'run', 'easily', 'fairly', 'study', 'study']
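
Note that WordNetLemmatizer treats every word as a noun unless you pass pos, which is why pos='v' was needed above. A small helper (a sketch using the averaged_perceptron_tagger data from the installation step) can look the tag up automatically:

from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

def wordnet_pos(treebank_tag):
    # Map Penn Treebank tags to WordNet POS constants
    return {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }.get(treebank_tag[0], wordnet.NOUN)

lemmatizer = WordNetLemmatizer()
tokens = word_tokenize("The children were running and studying all day")
print([lemmatizer.lemmatize(w, wordnet_pos(t)) for w, t in pos_tag(tokens)])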

Text Preprocessing Pipeline

import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

class TextPreprocessor:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def preprocess(self, text):
        # Lowercase
        text = text.lower()

        # Remove URLs
        text = re.sub(r'http\S+|www\S+', '', text)

        # Remove HTML tags
        text = re.sub(r'<.*?>', '', text)

        # Remove numbers
        text = re.sub(r'\d+', '', text)

        # Remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))

        # Tokenize
        tokens = word_tokenize(text)

        # Remove stopwords and lemmatize
        tokens = [
            self.lemmatizer.lemmatize(token)
            for token in tokens
            if token not in self.stop_words and len(token) > 2
        ]

        return ' '.join(tokens)

# Usage
preprocessor = TextPreprocessor()
text = "Check out this amazing website: https://example.com! It's the BEST!!!"
print(preprocessor.preprocess(text))
# "check amazing website best"

spaCy for NLP

Basic Processing

import spacy

nlp = spacy.load('en_core_web_sm')

text = "Apple is looking at buying U.K. startup for $1 billion"
doc = nlp(text)

# Tokenization
for token in doc:
    print(f"{token.text:12} {token.pos_:6} {token.dep_:10} {token.lemma_}")

# Output:
# Apple        PROPN  nsubj      Apple
# is           AUX    aux        be
# looking      VERB   ROOT       look
# at           ADP    prep       at
# buying       VERB   pcomp      buy
# U.K.         PROPN  compound   U.K.
# startup      NOUN   dobj       startup
# for          ADP    prep       for
# $            SYM    quantmod   $
# 1            NUM    compound   1
# billion      NUM    pobj       billion

Named Entity Recognition (NER)

import spacy

nlp = spacy.load('en_core_web_sm')

text = """
Apple Inc. was founded by Steve Jobs in California.
The company is worth over $2 trillion as of 2024.
"""

doc = nlp(text)

for ent in doc.ents:
    print(f"{ent.text:20} {ent.label_:10} {spacy.explain(ent.label_)}")

# Output:
# Apple Inc.           ORG        Companies, agencies, institutions
# Steve Jobs           PERSON     People, including fictional
# California           GPE        Countries, cities, states
# over $2 trillion     MONEY      Monetary values
# 2024                 DATE       Absolute or relative dates
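
For inspection, displacy can highlight the detected entities: render() returns HTML (and draws inline in a notebook), while serve() starts a local preview server:

from spacy import displacy

# Inline highlighting in a Jupyter notebook
displacy.render(doc, style="ent", jupyter=True)

# Or preview at http://localhost:5000
# displacy.serve(doc, style="ent")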

Dependency Parsing

import spacy
from spacy import displacy

nlp = spacy.load('en_core_web_sm')
doc = nlp("The quick brown fox jumps over the lazy dog")

# Print dependencies
for token in doc:
    print(f"{token.text} -> {token.head.text} ({token.dep_})")

# Visualize the parse tree (starts a local server at http://localhost:5000)
displacy.serve(doc, style="dep")

Sentiment Analysis

Using TextBlob

from textblob import TextBlob

def analyze_sentiment(text):
    blob = TextBlob(text)
    polarity = blob.sentiment.polarity  # -1 to 1
    subjectivity = blob.sentiment.subjectivity  # 0 to 1

    if polarity > 0.1:
        sentiment = "Positive"
    elif polarity < -0.1:
        sentiment = "Negative"
    else:
        sentiment = "Neutral"

    return {
        "text": text,
        "sentiment": sentiment,
        "polarity": polarity,
        "subjectivity": subjectivity
    }

# Examples
texts = [
    "I love this product! It's amazing!",
    "This is the worst experience ever.",
    "The weather is okay today."
]

for text in texts:
    result = analyze_sentiment(text)
    print(f"{result['sentiment']:10} ({result['polarity']:.2f}): {text}")

# Output:
# Positive   (0.62): I love this product! It's amazing!
# Negative   (-0.80): This is the worst experience ever.
# Neutral    (0.00): The weather is okay today.

Using Transformers (BERT)

from transformers import pipeline

# Load sentiment analysis pipeline
sentiment_analyzer = pipeline("sentiment-analysis")

texts = [
    "I absolutely love this movie!",
    "The service was terrible and slow.",
    "It's an average product, nothing special."
]

results = sentiment_analyzer(texts)
for text, result in zip(texts, results):
    print(f"{result['label']:10} ({result['score']:.2f}): {text}")

Text Classification

TF-IDF with Scikit-learn

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Sample data
texts = [
    "Great product, highly recommend!",
    "Terrible quality, waste of money",
    "Average item, does the job",
    "Best purchase ever, love it!",
    "Disappointed with this product",
    # ... more data
]
labels = ["positive", "negative", "neutral", "positive", "negative"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Create pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
    ('clf', MultinomialNB())
])

# Train
pipeline.fit(X_train, y_train)

# Predict
predictions = pipeline.predict(X_test)
print(classification_report(y_test, predictions))
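
The fitted pipeline handles vectorization and prediction for new text in a single call; a quick usage sketch:

# Classify unseen reviews with the trained pipeline
new_reviews = ["Absolutely fantastic, would buy again", "Broke after one day"]
print(pipeline.predict(new_reviews))
print(pipeline.predict_proba(new_reviews))  # per-class probabilities from MultinomialNB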

Word Embeddings

Word2Vec

from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Sample corpus
corpus = [
    "machine learning is fascinating",
    "deep learning uses neural networks",
    "natural language processing is useful",
    "python is great for data science"
]

# Tokenize
tokenized = [word_tokenize(sent.lower()) for sent in corpus]

# Train Word2Vec
model = Word2Vec(
    sentences=tokenized,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4
)

# Find similar words
similar = model.wv.most_similar("learning", topn=5)
print(similar)

# Word vector
vector = model.wv["machine"]
print(f"Vector shape: {vector.shape}")

# Similarity between words
similarity = model.wv.similarity("machine", "learning")
print(f"Similarity: {similarity}")

Practical Applications

Text Summarization

from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

text = """
Machine learning is a subset of artificial intelligence that provides
systems the ability to automatically learn and improve from experience
without being explicitly programmed. Machine learning focuses on the
development of computer programs that can access data and use it to
learn for themselves. The process begins with observations or data,
such as examples, direct experience, or instruction, in order to look
for patterns in data and make better decisions in the future.
"""

summary = summarizer(text, max_length=50, min_length=25, do_sample=False)
print(summary[0]['summary_text'])

Question Answering

from transformers import pipeline

qa_pipeline = pipeline("question-answering")

context = """
Python is a high-level programming language created by Guido van Rossum.
It was first released in 1991. Python emphasizes code readability and
supports multiple programming paradigms including procedural, object-oriented,
and functional programming.
"""

questions = [
    "Who created Python?",
    "When was Python released?",
    "What does Python emphasize?"
]

for question in questions:
    result = qa_pipeline(question=question, context=context)
    print(f"Q: {question}")
    print(f"A: {result['answer']} (confidence: {result['score']:.2f})\n")

Text Generation

from transformers import pipeline

generator = pipeline("text-generation", model="gpt2")

prompt = "Machine learning is"
generated = generator(
    prompt,
    max_length=50,
    num_return_sequences=1,
    do_sample=True,   # sampling must be enabled for temperature to take effect
    temperature=0.7
)

print(generated[0]['generated_text'])

Summary

Task            Library/Tool
Tokenization    NLTK, spaCy
Lemmatization   NLTK, spaCy
NER             spaCy, Transformers
Sentiment       TextBlob, Transformers
Classification  Scikit-learn
Embeddings      Gensim, Transformers
Generation      Transformers (GPT)

NLP enables powerful text analysis and generation capabilities. Start with basic preprocessing, then explore advanced deep learning models for production applications.
