GitHub Overview

duckdb/duckdb

DuckDB is an analytical in-process SQL database management system

Stars: 31,277
Watchers: 229
Forks: 2,471
Created: June 26, 2018
Language: C++
License: MIT License

Topics

analytics, database, embedded-database, olap, sql

Star History

duckdb/duckdb star history chart (data as of 7/30/2025, 02:37 AM)

DuckDB + Vector Search

Overview

DuckDB is an in-process SQL OLAP (online analytical processing) database: it embeds in the host application the way SQLite does, but its engine is optimized for analytical workloads. The VSS (Vector Similarity Search) extension adds vector search, enabling fast local similarity search combined with analytical queries.

Details

DuckDB development began in 2018 at CWI (Centrum Wiskunde & Informatica) and is now led by DuckDB Labs. With its columnar storage engine and vectorized, parallel query execution, it delivers strong performance on analytical queries. The VSS extension adds vector search functionality, so data analysis and AI workloads can run in the same engine.

Key features of DuckDB + vector search:

  • Vector search in an embedded database
  • HNSW (Hierarchical Navigable Small World) index
  • Integration with fast analytical queries
  • Memory-efficient processing
  • Parallel processing and vectorized execution
  • Direct integration with Python, R, Java
  • Zero-copy data sharing
  • Direct reading of Parquet, CSV, and JSON (see the sketch after this list)
  • Simple SQL operations
  • Lightweight (few MB) binary
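
The file-reading bullet above is easy to demonstrate: DuckDB queries Parquet, CSV, and JSON files in place, with no import step. A minimal sketch (the file names are hypothetical):

import duckdb

conn = duckdb.connect(':memory:')

# Query files directly; schemas are inferred automatically
conn.execute("SELECT COUNT(*) FROM read_parquet('events.parquet')")
conn.execute("SELECT * FROM read_csv_auto('users.csv') LIMIT 5")
conn.execute("SELECT * FROM read_json_auto('logs.json') LIMIT 5")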

Architecture Features

  • Columnar storage
  • Vectorized query execution engine
  • Parallel processing and multi-core optimization
  • Pushdown optimization (filters and projections pushed into scans; see the sketch below)
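
Pushdown can be observed directly: when a query filters a Parquet scan, DuckDB pushes the predicate and the projected columns into the reader itself, which EXPLAIN makes visible. A minimal sketch (the file name is hypothetical):

import duckdb

conn = duckdb.connect(':memory:')

# The plan lists the filter inside the PARQUET_SCAN operator
# rather than in a separate downstream Filter node
print(conn.execute("""
    EXPLAIN SELECT title FROM read_parquet('documents.parquet')
    WHERE category = 'technology'
""").fetchall())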

Pros and Cons

Pros

  • Easy deployment: Serverless embedded database
  • Fast analytics: Excellent performance with OLAP optimization
  • Low latency: Minimal overhead with in-process execution
  • Rich features: Full SQL support, including window functions and CTEs
  • Easy integration: Direct bindings to Python, R, Java
  • Cost-effective: Open source and free

Cons

  • Scalability: Limited to a single machine; no built-in distribution
  • Concurrency: Only one process can write to a database file at a time
  • Persistence: Not designed as a primary transactional (OLTP) store
  • Vector feature limitations: Fewer capabilities than dedicated vector databases
  • Memory limitations: Large datasets can exhaust memory, though many operators can spill to the temp directory at reduced speed

Usage Examples

Setup and Installation

import duckdb
import numpy as np
import pandas as pd

# DuckDB connection and VSS extension installation
conn = duckdb.connect(':memory:')

# Install VSS extension
conn.execute("INSTALL vss;")
conn.execute("LOAD vss;")

# Create table (a sequence supplies auto-incrementing ids, since the
# INSERT statements below do not provide the PRIMARY KEY themselves)
conn.execute("CREATE SEQUENCE doc_id_seq;")
conn.execute("""
    CREATE TABLE documents (
        id INTEGER PRIMARY KEY DEFAULT nextval('doc_id_seq'),
        title VARCHAR,
        content TEXT,
        embedding FLOAT[768],
        category VARCHAR,
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    )
""")

Vector Data Insertion and Search

# Insert a single document (category defaults to 'technology' but is
# a parameter rather than being hardcoded)
def insert_document(conn, title, content, embedding, category="technology"):
    conn.execute("""
        INSERT INTO documents (title, content, embedding, category)
        VALUES (?, ?, ?, ?)
    """, [title, content, embedding.tolist(), category])

# Insert sample data
embedding = np.random.rand(768).astype(np.float32)
insert_document(
    conn,
    "DuckDB Vector Search",
    "Fast vector search in embedded database",
    embedding
)

# Create HNSW index
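# Note: with a file-backed database, persisting HNSW indexes is
# experimental in the VSS extension and must be opted into first:
#   conn.execute("SET hnsw_enable_experimental_persistence = true;")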
conn.execute("""
    CREATE INDEX idx_embedding ON documents 
    USING HNSW (embedding) 
    WITH (metric = 'cosine', ef_construction = 128, M = 16)
""")

# Vector search
def vector_search(conn, query_vector, limit=10):
    result = conn.execute("""
        SELECT 
            id, 
            title, 
            content,
            array_cosine_similarity(embedding, ?::FLOAT[768]) as similarity
        FROM documents
        ORDER BY array_cosine_similarity(embedding, ?::FLOAT[768]) DESC
        LIMIT ?
    """, [query_vector.tolist(), query_vector.tolist(), limit])
    
    return result.fetchdf()

# Execute search
query_embedding = np.random.rand(768).astype(np.float32)
results = vector_search(conn, query_embedding)
print(results)
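
Search-time recall can be traded against latency. The VSS extension exposes this through the hnsw_ef_search setting (a sketch, assuming the setting name from the VSS documentation):

# Larger ef_search explores more of the graph: better recall, slower queries
conn.execute("SET hnsw_ef_search = 64;")
results = vector_search(conn, query_embedding, limit=5)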

Integration with Analytical Queries

# Prepare analytical data
conn.execute("""
    CREATE TABLE analytics_data AS
    SELECT 
        d.*,
        EXTRACT(MONTH FROM created_at) as month,
        LENGTH(content) as content_length
    FROM documents d
""")

# Combined vector search and analytics
def analytical_vector_search(conn, query_vector, category=None):
    query = """
        WITH vector_results AS (
            SELECT 
                id,
                title,
                category,
                array_cosine_similarity(embedding, ?::FLOAT[768]) as similarity
            FROM documents
            WHERE array_cosine_similarity(embedding, ?::FLOAT[768]) > 0.8
              AND (?::VARCHAR IS NULL OR category = ?)
        )
        SELECT 
            vr.category,
            COUNT(*) as count,
            AVG(vr.similarity) as avg_similarity,
            MIN(vr.similarity) as min_similarity,
            MAX(vr.similarity) as max_similarity
        FROM vector_results vr
        GROUP BY vr.category
        ORDER BY avg_similarity DESC
    """
    
    return conn.execute(
        query,
        [query_vector.tolist(), query_vector.tolist(), category, category]
    ).fetchdf()

# Advanced analytics with window functions
def trending_similar_documents(conn, query_vector):
    query = """
        WITH ranked_docs AS (
            SELECT 
                id,
                title,
                created_at,
                array_cosine_similarity(embedding, ?::FLOAT[768]) as similarity,
                ROW_NUMBER() OVER (
                    PARTITION BY DATE_TRUNC('day', created_at) 
                    ORDER BY array_cosine_similarity(embedding, ?::FLOAT[768]) DESC
                ) as daily_rank
            FROM documents
            WHERE created_at >= CURRENT_DATE - INTERVAL 30 DAY
        )
        SELECT 
            DATE_TRUNC('day', created_at) as date,
            title,
            similarity,
            daily_rank
        FROM ranked_docs
        WHERE daily_rank <= 5
        ORDER BY date DESC, daily_rank
    """
    
    return conn.execute(
        query,
        [query_vector.tolist(), query_vector.tolist()]
    ).fetchdf()
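
Both helpers can be exercised with the query embedding from the earlier search example:

# Category-level similarity statistics and per-day top documents
stats = analytical_vector_search(conn, query_embedding)
print(stats)

trending = trending_similar_documents(conn, query_embedding)
print(trending)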

Batch Processing and Optimization

# Batch insert from Pandas DataFrame
def batch_insert_from_dataframe(conn, df):
    # Create DuckDB table directly from DataFrame
    conn.register('temp_df', df)
    
    conn.execute("""
        INSERT INTO documents (title, content, embedding, category)
        SELECT 
            title,
            content,
            embedding::FLOAT[768],
            category
        FROM temp_df
    """)

# Direct vector search from Parquet file
def search_from_parquet(conn, parquet_file, query_vector):
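    # Note: binding the file path as a `?` argument of read_parquet()
    # relies on prepared-parameter support in table functions (recent
    # DuckDB releases); on older versions, interpolate a trusted path.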
    query = """
        SELECT 
            title,
            content,
            array_cosine_similarity(embedding::FLOAT[768], ?::FLOAT[768]) as similarity
        FROM read_parquet(?)
        WHERE array_cosine_similarity(embedding::FLOAT[768], ?::FLOAT[768]) > 0.7
        ORDER BY similarity DESC
        LIMIT 10
    """
    
    return conn.execute(
        query,
        [query_vector.tolist(), parquet_file, query_vector.tolist()]
    ).fetchdf()

# Memory usage optimization
def optimize_memory_usage(conn):
    # Set memory limit
    conn.execute("SET memory_limit='2GB';")
    
    # Set temporary file directory
    conn.execute("SET temp_directory='/tmp/duckdb';")
    
    # Set worker thread count
    conn.execute("SET threads=4;")