Python 6 min read

Vector Databases Explained: Essential Guide for AI Applications

Learn vector databases for AI and ML applications. Compare Pinecone, Chroma, Weaviate, and build semantic search with embeddings.

MR

Moshiour Rahman

Advertisement

What are Vector Databases?

Vector databases store and search high-dimensional vectors (embeddings) efficiently. They power semantic search, recommendation systems, and RAG applications by finding similar items based on meaning rather than exact matches.

Traditional Search | Vector Search
Keyword matching | Semantic meaning
Exact matches | Similar concepts
SQL queries | Nearest neighbor
Structured data | Embeddings

Understanding Embeddings

What are Embeddings?

Embeddings are numerical representations of data (text, images, audio) that capture semantic meaning in a vector space.

from openai import OpenAI

client = OpenAI()

def get_embedding(text: str) -> list[float]:
    """Embed *text* with OpenAI's text-embedding-3-small model.

    Returns the embedding as a plain list of floats (1536 dimensions
    for this model).
    """
    # Network call: requires OPENAI_API_KEY in the environment.
    result = client.embeddings.create(
        model="text-embedding-3-small",
        input=text,
    )
    return result.data[0].embedding

# Create embeddings
text1 = "Machine learning is fascinating"
text2 = "AI and ML are interesting topics"
text3 = "I love cooking pasta"

emb1 = get_embedding(text1)
emb2 = get_embedding(text2)
emb3 = get_embedding(text3)

print(f"Embedding dimension: {len(emb1)}")  # 1536

Similarity Metrics

import numpy as np

def cosine_similarity(a: list, b: list) -> float:
    """Cosine of the angle between *a* and *b* (1.0 = identical direction)."""
    va = np.asarray(a)
    vb = np.asarray(b)
    # Dot product normalized by both vector magnitudes.
    return np.dot(va, vb) / (np.linalg.norm(va) * np.linalg.norm(vb))

def euclidean_distance(a: list, b: list) -> float:
    """Straight-line (L2) distance between vectors *a* and *b*."""
    diff = np.asarray(a) - np.asarray(b)
    return np.linalg.norm(diff)

# Similar texts have high cosine similarity
print(f"text1 vs text2: {cosine_similarity(emb1, emb2):.4f}")  # ~0.85
print(f"text1 vs text3: {cosine_similarity(emb1, emb3):.4f}")  # ~0.65

ChromaDB

Installation and Setup

pip install chromadb
import chromadb
from chromadb.config import Settings

# Create client
client = chromadb.Client()

# Or persistent client
client = chromadb.PersistentClient(path="./chroma_data")

# Create collection
collection = client.create_collection(
    name="documents",
    metadata={"hnsw:space": "cosine"}  # Distance metric
)

Basic Operations

# Add documents
collection.add(
    documents=[
        "Machine learning enables computers to learn from data",
        "Deep learning uses neural networks with many layers",
        "Natural language processing helps computers understand text",
        "Computer vision allows machines to interpret images"
    ],
    metadatas=[
        {"category": "ml", "year": 2024},
        {"category": "dl", "year": 2024},
        {"category": "nlp", "year": 2024},
        {"category": "cv", "year": 2024}
    ],
    ids=["doc1", "doc2", "doc3", "doc4"]
)

# Query
results = collection.query(
    query_texts=["How do neural networks work?"],
    n_results=2
)

print(results["documents"])
print(results["distances"])

With Custom Embeddings

from chromadb.utils import embedding_functions

# OpenAI embeddings
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key="your-api-key",
    model_name="text-embedding-3-small"
)

collection = client.create_collection(
    name="openai_docs",
    embedding_function=openai_ef
)

# Add documents (embeddings generated automatically)
collection.add(
    documents=["Your text here"],
    ids=["id1"]
)

Filtering

# Query with metadata filter
results = collection.query(
    query_texts=["neural networks"],
    n_results=5,
    where={"category": "dl"},
    where_document={"$contains": "deep"}
)

# Complex filters
results = collection.query(
    query_texts=["AI applications"],
    where={
        "$and": [
            {"category": {"$in": ["ml", "dl"]}},
            {"year": {"$gte": 2023}}
        ]
    }
)

Pinecone

Setup

pip install pinecone-client
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key="your-api-key")

# Create index
pc.create_index(
    name="my-index",
    dimension=1536,
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)

index = pc.Index("my-index")

Operations

from openai import OpenAI

client = OpenAI()

def get_embedding(text):
    """Return the OpenAI embedding vector for *text*."""
    # Same helper as in the embeddings section; repeated here so the
    # Pinecone example is self-contained. Makes a network call.
    resp = client.embeddings.create(input=text, model="text-embedding-3-small")
    return resp.data[0].embedding

# Upsert vectors
vectors = [
    {
        "id": "doc1",
        "values": get_embedding("Machine learning basics"),
        "metadata": {"category": "ml", "source": "blog"}
    },
    {
        "id": "doc2",
        "values": get_embedding("Deep learning fundamentals"),
        "metadata": {"category": "dl", "source": "docs"}
    }
]

index.upsert(vectors=vectors)

# Query
query_embedding = get_embedding("How does ML work?")
results = index.query(
    vector=query_embedding,
    top_k=5,
    include_metadata=True
)

for match in results["matches"]:
    print(f"ID: {match['id']}, Score: {match['score']:.4f}")

Namespaces

# Different namespaces for different data types
index.upsert(vectors=vectors, namespace="blog-posts")
index.upsert(vectors=vectors, namespace="documentation")

# Query specific namespace
results = index.query(
    vector=query_embedding,
    top_k=5,
    namespace="blog-posts"
)

Weaviate

Setup

pip install weaviate-client
import weaviate
from weaviate.classes.init import Auth

client = weaviate.connect_to_wcs(
    cluster_url="your-cluster-url",
    auth_credentials=Auth.api_key("your-api-key")
)

# Or local instance
client = weaviate.connect_to_local()

Schema and Data

from weaviate.classes.config import Configure, Property, DataType

# Create collection
client.collections.create(
    name="Article",
    vectorizer_config=Configure.Vectorizer.text2vec_openai(),
    properties=[
        Property(name="title", data_type=DataType.TEXT),
        Property(name="content", data_type=DataType.TEXT),
        Property(name="category", data_type=DataType.TEXT)
    ]
)

# Add data
articles = client.collections.get("Article")
articles.data.insert({
    "title": "Introduction to Machine Learning",
    "content": "Machine learning is a subset of AI...",
    "category": "ML"
})
# Semantic search
response = articles.query.near_text(
    query="artificial intelligence",
    limit=5
)

for obj in response.objects:
    print(obj.properties["title"])

# Hybrid search (vector + keyword)
response = articles.query.hybrid(
    query="machine learning basics",
    limit=5,
    alpha=0.5  # Balance between vector and keyword
)
FAISS

Setup

pip install faiss-cpu  # or faiss-gpu
import faiss
import numpy as np
from openai import OpenAI

client = OpenAI()

# Create embeddings
texts = [
    "Machine learning fundamentals",
    "Deep learning with neural networks",
    "Natural language processing basics",
    "Computer vision applications"
]

embeddings = []
for text in texts:
    response = client.embeddings.create(
        input=text,
        model="text-embedding-3-small"
    )
    embeddings.append(response.data[0].embedding)

# Convert to numpy array
embeddings = np.array(embeddings).astype('float32')

# Create FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # L2 distance
index.add(embeddings)

print(f"Total vectors: {index.ntotal}")

# Search
query = "How do neural networks learn?"
query_embedding = np.array([get_embedding(query)]).astype('float32')

distances, indices = index.search(query_embedding, k=2)

for i, idx in enumerate(indices[0]):
    print(f"Result {i+1}: {texts[idx]} (distance: {distances[0][i]:.4f})")

Saving and Loading

# Save index
faiss.write_index(index, "vectors.faiss")

# Load index
loaded_index = faiss.read_index("vectors.faiss")

Complete Example

import chromadb
from openai import OpenAI
from typing import List, Dict

class SemanticSearch:
    """Semantic document search backed by a persistent ChromaDB collection.

    Embeddings are generated with OpenAI's text-embedding-3-small model
    and stored in a cosine-distance HNSW index under ./db.
    """

    def __init__(self, collection_name: str = "documents"):
        self.client = chromadb.PersistentClient(path="./db")
        self.openai = OpenAI()
        # get_or_create makes repeated runs against the same path safe.
        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            metadata={"hnsw:space": "cosine"},
        )

    def add_documents(self, documents: List[Dict]):
        """Add documents with metadata.

        Each dict must carry "id" and "content"; "metadata" is optional
        and defaults to an empty dict.
        """
        contents = [doc["content"] for doc in documents]
        doc_ids = [doc["id"] for doc in documents]
        metas = [doc.get("metadata", {}) for doc in documents]

        # Embed in one batched API call, then store alongside the raw text.
        self.collection.add(
            embeddings=self._get_embeddings(contents),
            documents=contents,
            metadatas=metas,
            ids=doc_ids,
        )

    def search(self, query: str, n_results: int = 5,
               filter_metadata: Dict = None) -> List[Dict]:
        """Return up to *n_results* documents most similar to *query*.

        Each hit is a dict with id, content, metadata, and distance
        (smaller distance = more similar).
        """
        query_vector = self._get_embeddings([query])[0]

        raw = self.collection.query(
            query_embeddings=[query_vector],
            n_results=n_results,
            where=filter_metadata,
        )

        # Chroma nests results one level per query; we sent one query,
        # so unwrap index [0] everywhere.
        hits = []
        for i, doc_id in enumerate(raw["ids"][0]):
            hits.append({
                "id": doc_id,
                "content": raw["documents"][0][i],
                "metadata": raw["metadatas"][0][i],
                "distance": raw["distances"][0][i],
            })
        return hits

    def _get_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Embed a batch of texts via the OpenAI embeddings endpoint."""
        resp = self.openai.embeddings.create(
            input=texts,
            model="text-embedding-3-small",
        )
        return [item.embedding for item in resp.data]

# Usage
search = SemanticSearch()

# Add documents
search.add_documents([
    {"id": "1", "content": "Python is a programming language", "metadata": {"type": "tech"}},
    {"id": "2", "content": "Machine learning uses algorithms", "metadata": {"type": "ai"}},
    {"id": "3", "content": "Deep learning is a subset of ML", "metadata": {"type": "ai"}}
])

# Search
results = search.search("artificial intelligence", n_results=2)
for r in results:
    print(f"{r['content']} (score: {1-r['distance']:.4f})")

Comparison

Database | Best For | Hosting
ChromaDB | Local development, prototypes | Self-hosted
Pinecone | Production, managed service | Cloud
Weaviate | Hybrid search, GraphQL | Both
FAISS | High performance, local | Self-hosted
Milvus | Large scale, enterprise | Both

Summary

Vector databases are essential for:

  1. Semantic Search - Find similar content by meaning
  2. RAG Systems - Retrieve context for LLMs
  3. Recommendations - Similar items/users
  4. Deduplication - Find near-duplicates

Choose based on scale, hosting preferences, and feature requirements.

Advertisement

MR

Moshiour Rahman

Software Architect & AI Engineer

Share:
MR

Moshiour Rahman

Software Architect & AI Engineer

Enterprise software architect with deep expertise in financial systems, distributed architecture, and AI-powered applications. Building large-scale systems at Fortune 500 companies. Specializing in LLM orchestration, multi-agent systems, and cloud-native solutions. I share battle-tested patterns from real enterprise projects.

Related Articles

Comments

Comments are powered by GitHub Discussions.

Configure Giscus at giscus.app to enable comments.