Apache Cassandra

Distributed NoSQL database providing high availability and linear scalability. Specialized in big data processing. Distributed architecture without single point of failure.

Database Server · NoSQL · Distributed · Column-oriented · High Availability · Big Data · Scalable · Linear Scalability

Database Server

Apache Cassandra

Overview

Apache Cassandra is an open-source NoSQL database designed for large-scale distributed systems. Originally developed at Facebook and now managed by the Apache Software Foundation, it delivers linear scalability and high availability with no single point of failure architecture, enabling mission-critical applications to handle massive amounts of data. Adopted by major companies like Netflix, Apple, Instagram, and Uber, Cassandra demonstrates exceptional strength in web-scale applications and time-series data processing. Its masterless architecture ensures all nodes are equal, providing continuous availability even during hardware failures.

Details

Apache Cassandra 2025 edition has established itself as the definitive distributed database solution through years of maturation. The latest 4.1 series introduces enhanced cluster monitoring through virtual tables, Storage Attached Indexing (SAI) for high-speed search capabilities, and improved compaction strategies with UCS (Unified Compaction Strategy). The CQL query language offers SQL-like syntax for easy adoption, while automatic data replication between nodes, Gossip protocol for failure detection, and tunable consistency provide performance-consistency trade-off controls. Storage Attached Indexing and Materialized Views enable complex query patterns, with significantly improved performance optimization in large-scale environments.

Key Features

  • Linear Scalability: Performance improves proportionally with node additions
  • High Availability: No single point of failure with automatic failover
  • Column-oriented Data Model: Flexible schema design and efficient data compression
  • Tunable Consistency: Selectable consistency levels based on use case requirements
  • Distributed Architecture: No master-slave configuration, all nodes are peers
  • Geographic Distribution: Cross-datacenter replication support

Pros and Cons

Pros

  • Proven performance and availability in large-scale distributed environments
  • Node-proportional scalability and elimination of single points of failure
  • Open source with rich ecosystem and strong community support
  • Standard support for geographic distribution and multi-datacenter configurations
  • Optimized for time-series data and real-time analytics
  • Battle-tested operational experience from major enterprises like Netflix and Uber

Cons

  • Complex distributed system requiring advanced expertise for operation and maintenance
  • Limited complex query capabilities such as JOINs and subqueries
  • Data model design heavily dependent on query patterns, making design changes difficult
  • High memory usage and demanding hardware resource requirements
  • Unsuitable for applications requiring strong consistency
  • Complex initial setup and cluster management

Reference Pages

Code Examples

Installation and Basic Setup

# Java 11+ installation (Ubuntu/Debian)
sudo apt update
sudo apt install openjdk-11-jdk

# Add the Cassandra signing key.
# NOTE(review): apt-key is deprecated on current Debian/Ubuntu; store the key
# in a keyring file and reference it with signed-by instead.
curl https://downloads.apache.org/cassandra/KEYS | sudo gpg --dearmor -o /usr/share/keyrings/cassandra.gpg

# Register the 4.1.x repository.
# Plain `tee` (not `tee -a`): re-running the setup must not duplicate the entry.
echo "deb [signed-by=/usr/share/keyrings/cassandra.gpg] https://debian.cassandra.apache.org 41x main" | sudo tee /etc/apt/sources.list.d/cassandra.sources.list

# Install Cassandra
sudo apt update
sudo apt install cassandra

# CentOS/RHEL/Fedora installation
sudo dnf install java-11-openjdk
# Plain `tee` overwrites the repo file, keeping this step idempotent.
sudo tee /etc/yum.repos.d/cassandra.repo << 'EOF'
[cassandra]
name=Apache Cassandra
baseurl=https://redhat.cassandra.apache.org/41x/
gpgcheck=1
repo_gpgcheck=1
gpgkey=https://downloads.apache.org/cassandra/KEYS
EOF

sudo dnf install cassandra

# Docker execution (9042 = CQL native-transport port)
docker run --name cassandra-node1 \
  -p 9042:9042 \
  -d cassandra:4.1

# Multi-node cluster (Docker Compose)
# node1 is the seed; node2/node3 join via CASSANDRA_SEEDS.
cat > docker-compose.yml << 'EOF'
version: '3.8'
services:
  cassandra-node1:
    image: cassandra:4.1
    ports:
      - "9042:9042"
    environment:
      - CASSANDRA_CLUSTER_NAME=MyCluster
      - CASSANDRA_ENDPOINT_SNITCH=GossipingPropertyFileSnitch
      - CASSANDRA_DC=datacenter1
      - CASSANDRA_RACK=rack1
    volumes:
      - cassandra1-data:/var/lib/cassandra

  cassandra-node2:
    image: cassandra:4.1
    environment:
      - CASSANDRA_CLUSTER_NAME=MyCluster
      - CASSANDRA_ENDPOINT_SNITCH=GossipingPropertyFileSnitch
      - CASSANDRA_DC=datacenter1
      - CASSANDRA_RACK=rack1
      - CASSANDRA_SEEDS=cassandra-node1
    volumes:
      - cassandra2-data:/var/lib/cassandra
    depends_on:
      - cassandra-node1

  cassandra-node3:
    image: cassandra:4.1
    environment:
      - CASSANDRA_CLUSTER_NAME=MyCluster
      - CASSANDRA_ENDPOINT_SNITCH=GossipingPropertyFileSnitch
      - CASSANDRA_DC=datacenter1
      # Different rack label so the snitch can spread replicas across racks.
      - CASSANDRA_RACK=rack2
      - CASSANDRA_SEEDS=cassandra-node1
    volumes:
      - cassandra3-data:/var/lib/cassandra
    depends_on:
      - cassandra-node1

volumes:
  cassandra1-data:
  cassandra2-data:
  cassandra3-data:
EOF

docker-compose up -d

# Service start and status check (package installs)
sudo systemctl enable cassandra
sudo systemctl start cassandra
sudo systemctl status cassandra

# Check cluster status (UN = Up/Normal)
nodetool status
nodetool info

Basic CQL Queries and Data Operations

-- Connect with cqlsh (the CQL shell); 9042 is the native-transport port
cqlsh localhost 9042

-- Create keyspace (database equivalent)
-- NetworkTopologyStrategy with a per-datacenter replication factor is the
-- production-recommended strategy.
CREATE KEYSPACE IF NOT EXISTS myapp
WITH REPLICATION = {
    'class': 'NetworkTopologyStrategy',
    'datacenter1': 3
};

-- Simple replication strategy (for development only — not rack/DC aware)
CREATE KEYSPACE IF NOT EXISTS myapp_dev
WITH REPLICATION = {
    'class': 'SimpleStrategy',
    'replication_factor': 1
};

USE myapp;

-- Create user table
-- The profile MAP stores free-form key/value attributes without schema changes.
CREATE TABLE IF NOT EXISTS users (
    user_id UUID PRIMARY KEY,
    username TEXT,
    email TEXT,
    first_name TEXT,
    last_name TEXT,
    created_at TIMESTAMP,
    updated_at TIMESTAMP,
    is_active BOOLEAN,
    profile MAP<TEXT, TEXT>
);

-- Create secondary indexes
-- NOTE(review): native secondary indexes are maintained per node and are best
-- suited to low-cardinality columns; for hot lookup paths prefer a dedicated
-- query table (see Materialized View / denormalization sections below).
CREATE INDEX IF NOT EXISTS users_username_idx ON users (username);
CREATE INDEX IF NOT EXISTS users_email_idx ON users (email);

-- Insert data
-- uuid() generates a random UUID server-side; toTimestamp(now()) converts a
-- TIMEUUID to a TIMESTAMP.
INSERT INTO users (user_id, username, email, first_name, last_name, created_at, updated_at, is_active, profile)
VALUES (uuid(), 'john_doe', '[email protected]', 'John', 'Doe', toTimestamp(now()), toTimestamp(now()), true, 
        {'bio': 'Software Engineer', 'location': 'Tokyo', 'company': 'Tech Corp'});

INSERT INTO users (user_id, username, email, first_name, last_name, created_at, updated_at, is_active, profile)
VALUES (uuid(), 'jane_smith', '[email protected]', 'Jane', 'Smith', toTimestamp(now()), toTimestamp(now()), true,
        {'bio': 'Data Scientist', 'location': 'Osaka', 'company': 'Analytics Inc'});

-- Data retrieval
-- The unfiltered SELECT * is a full cluster scan — fine in development only.
SELECT * FROM users;
SELECT user_id, username, email FROM users WHERE username = 'john_doe';
SELECT * FROM users WHERE email = '[email protected]';

-- Composite primary key table (for time-series data)
-- Partition key (user_id, activity_date) buckets one user's activity per day;
-- activity_time is the clustering column, stored newest-first.
CREATE TABLE IF NOT EXISTS user_activities (
    user_id UUID,
    activity_date DATE,
    activity_time TIMESTAMP,
    activity_type TEXT,
    details MAP<TEXT, TEXT>,
    metadata FROZEN<MAP<TEXT, TEXT>>,
    PRIMARY KEY ((user_id, activity_date), activity_time)
) WITH CLUSTERING ORDER BY (activity_time DESC);

-- Time-series data insertion
-- NOTE(review): uuid() yields a NEW random id on every call, so these example
-- rows land in unrelated partitions — bind a real user_id in practice.
INSERT INTO user_activities (user_id, activity_date, activity_time, activity_type, details)
VALUES (uuid(), '2024-01-15', toTimestamp(now()), 'login', {'ip_address': '192.168.1.100', 'user_agent': 'Chrome'});

INSERT INTO user_activities (user_id, activity_date, activity_time, activity_type, details)
VALUES (uuid(), '2024-01-15', toTimestamp(now()), 'page_view', {'page': '/dashboard', 'referrer': '/login'});

-- Partition-scoped queries (efficient: reads a single partition)
SELECT * FROM user_activities 
WHERE user_id = uuid() AND activity_date = '2024-01-15'
ORDER BY activity_time DESC
LIMIT 10;

-- Batch statements for multiple operations
-- Logged batches provide eventual atomicity across the writes, not a
-- performance win; batches spanning many partitions add coordinator overhead.
BEGIN BATCH
    INSERT INTO users (user_id, username, email, first_name, last_name, created_at, is_active)
    VALUES (uuid(), 'bob_wilson', '[email protected]', 'Bob', 'Wilson', toTimestamp(now()), true);
    
    INSERT INTO user_activities (user_id, activity_date, activity_time, activity_type, details)
    VALUES (uuid(), '2024-01-15', toTimestamp(now()), 'registration', {'method': 'email', 'source': 'website'});
APPLY BATCH;

-- TTL (Time To Live) usage
INSERT INTO users (user_id, username, email, first_name, last_name, created_at, is_active)
VALUES (uuid(), 'temp_user', '[email protected]', 'Temp', 'User', toTimestamp(now()), true)
USING TTL 3600; -- Auto-delete after 1 hour

-- Counter table
-- Counter columns must be the only non-key columns of their table and cannot
-- be combined with TTL.
CREATE TABLE IF NOT EXISTS page_view_counts (
    page_url TEXT PRIMARY KEY,
    view_count COUNTER
);

UPDATE page_view_counts SET view_count = view_count + 1 WHERE page_url = '/dashboard';
UPDATE page_view_counts SET view_count = view_count + 5 WHERE page_url = '/profile';

SELECT * FROM page_view_counts;

Advanced Data Modeling and Query Optimization

-- Product catalog and review system
-- Cassandra modeling is query-first: one table per access pattern, with data
-- denormalized across them (no JOINs).
CREATE TABLE IF NOT EXISTS products (
    product_id UUID PRIMARY KEY,
    name TEXT,
    description TEXT,
    category TEXT,
    price DECIMAL,
    tags SET<TEXT>,
    specifications MAP<TEXT, TEXT>,
    created_at TIMESTAMP,
    updated_at TIMESTAMP
);

-- Product search by category table
-- Partitioned by category; rows cluster newest-first, with product_id as a
-- tie-breaker to keep the primary key unique.
CREATE TABLE IF NOT EXISTS products_by_category (
    category TEXT,
    product_id UUID,
    name TEXT,
    price DECIMAL,
    created_at TIMESTAMP,
    PRIMARY KEY (category, created_at, product_id)
) WITH CLUSTERING ORDER BY (created_at DESC, product_id ASC);

-- Product review table (composite partition for user and product)
-- One partition per (product, user); TIMEUUID clustering gives each review a
-- unique, time-ordered id.
CREATE TABLE IF NOT EXISTS product_reviews (
    product_id UUID,
    user_id UUID,
    review_id TIMEUUID,
    rating INT,
    title TEXT,
    content TEXT,
    helpful_votes INT,
    created_at TIMESTAMP,
    PRIMARY KEY ((product_id, user_id), review_id)
) WITH CLUSTERING ORDER BY (review_id DESC);

-- Product review summary table
-- Counter-only table: aggregates are maintained incrementally at write time
-- because Cassandra has no cross-partition aggregation at read time.
CREATE TABLE IF NOT EXISTS product_review_summary (
    product_id UUID PRIMARY KEY,
    total_reviews COUNTER,
    total_rating COUNTER,
    five_star_count COUNTER,
    four_star_count COUNTER,
    three_star_count COUNTER,
    two_star_count COUNTER,
    one_star_count COUNTER
);

-- Data insertion and query patterns
-- Product data insertion
INSERT INTO products (product_id, name, description, category, price, tags, specifications, created_at, updated_at)
VALUES (
    uuid(),
    'Gaming Laptop Pro',
    'High-performance gaming laptop',
    'Electronics',
    1299.99,
    {'gaming', 'laptop', 'high-performance'},
    {'cpu': 'Intel i7-12700H', 'gpu': 'RTX 3070', 'memory': '32GB', 'storage': '1TB SSD'},
    toTimestamp(now()),
    toTimestamp(now())
);

-- Category-specific product table insertion (denormalized)
-- NOTE(review): uuid() generates a fresh id here, so this row is NOT linked
-- to the products row above — real code must reuse the same product_id.
INSERT INTO products_by_category (category, product_id, name, price, created_at)
VALUES ('Electronics', uuid(), 'Gaming Laptop Pro', 1299.99, toTimestamp(now()));

-- Review insertion
INSERT INTO product_reviews (product_id, user_id, review_id, rating, title, content, helpful_votes, created_at)
VALUES (
    uuid(), -- product_id
    uuid(), -- user_id  
    now(),  -- review_id (TIMEUUID)
    5,
    'Excellent gaming laptop!',
    'Great performance for gaming and work. Highly recommended.',
    0,
    toTimestamp(now())
);

-- Review summary counter update
-- NOTE(review): uuid() in the WHERE clause targets a random (new) partition;
-- bind the actual product_id in practice.
UPDATE product_review_summary 
SET total_reviews = total_reviews + 1, 
    total_rating = total_rating + 5,
    five_star_count = five_star_count + 1
WHERE product_id = uuid();

-- Efficient query patterns
-- Category-based product retrieval (partition-sorted)
-- Single-partition read; rows come back newest-first per the clustering order.
SELECT * FROM products_by_category 
WHERE category = 'Electronics' 
LIMIT 20;

-- Specific product review retrieval
SELECT * FROM product_reviews 
WHERE product_id = uuid() AND user_id = uuid()
ORDER BY review_id DESC;

-- UDT (User Defined Type) usage
CREATE TYPE IF NOT EXISTS address (
    street TEXT,
    city TEXT,
    state TEXT,
    zip_code TEXT,
    country TEXT
);

-- FROZEN<address>: the UDT is serialized as one opaque value, so it must be
-- read and rewritten as a whole (no per-field updates).
CREATE TABLE IF NOT EXISTS users_with_address (
    user_id UUID PRIMARY KEY,
    username TEXT,
    email TEXT,
    home_address FROZEN<address>,
    work_address FROZEN<address>,
    created_at TIMESTAMP
);

-- UDT data insertion (UDT literal uses unquoted field names)
INSERT INTO users_with_address (user_id, username, email, home_address, created_at)
VALUES (
    uuid(),
    'alice_johnson',
    '[email protected]',
    {street: '123 Main St', city: 'Tokyo', state: 'Tokyo', zip_code: '100-0001', country: 'Japan'},
    toTimestamp(now())
);

Replication Strategy and Cluster Management

# Cluster information check
nodetool status          # per-node state (UN = Up/Normal), load, ownership
nodetool info
nodetool describecluster
nodetool ring

# Node addition preparation
# NOTE(review): edit these keys IN PLACE in /etc/cassandra/cassandra.yaml.
# Appending them with `tee -a` (as some guides show) creates duplicate YAML
# keys — Cassandra will fail to start or silently use the last value.
# Target values for the new node:
#   cluster_name: 'MyProductionCluster'
#   num_tokens: 256
#   seed_provider:
#     - class_name: org.apache.cassandra.locator.SimpleSeedProvider
#       parameters:
#         - seeds: "192.168.1.10,192.168.1.11,192.168.1.12"
#   listen_address: 192.168.1.15
#   rpc_address: 192.168.1.15
#   endpoint_snitch: GossipingPropertyFileSnitch
sudo vi /etc/cassandra/cassandra.yaml

# Add new node to cluster (bootstrap streams data automatically)
sudo systemctl start cassandra

# Monitor cluster status / streaming progress
nodetool status
nodetool netstats

# Datacenter configuration (cassandra-rackdc.properties)
# Plain `tee` overwrites the file — this step stays idempotent.
sudo tee /etc/cassandra/cassandra-rackdc.properties << 'EOF'
dc=datacenter1
rack=rack1
prefer_local=true
EOF

# Multi-datacenter keyspace — run the following statements in cqlsh, not the shell:
CREATE KEYSPACE IF NOT EXISTS myapp_multidc
WITH REPLICATION = {
    'class': 'NetworkTopologyStrategy',
    'datacenter1': 3,
    'datacenter2': 2
};

# Check replication information (cqlsh)
DESCRIBE KEYSPACE myapp_multidc;

# Node repair (regular maintenance)
nodetool repair -pr  # Primary range repair

# Specific keyspace repair
nodetool repair myapp

# Force compaction
nodetool compact myapp users

# Create snapshot
nodetool snapshot myapp --tag snapshot_$(date +%Y%m%d_%H%M%S)

# Check snapshots
nodetool listsnapshots

# Reclaim space held by deleted data.
# NOTE(review): there is no `nodetool gc` subcommand; the correct command is
# `garbagecollect` (JVM GC stats are available via `nodetool gcstats`).
nodetool garbagecollect myapp users

# Cluster statistics
nodetool tablestats myapp.users
nodetool cfstats myapp.users   # legacy alias of tablestats

Storage Attached Index (SAI) and Search Features

-- Create SAI indexes (for high-speed search)
-- SAI (Storage Attached Index) ships with Cassandra 4.1+ and serves
-- equality and range predicates without ALLOW FILTERING.
CREATE CUSTOM INDEX IF NOT EXISTS users_first_name_sai 
ON users (first_name) 
USING 'StorageAttachedIndex';

CREATE CUSTOM INDEX IF NOT EXISTS users_profile_sai 
ON users (profile) 
USING 'StorageAttachedIndex';

-- Range search-enabled SAI index
CREATE CUSTOM INDEX IF NOT EXISTS users_created_at_sai 
ON users (created_at) 
USING 'StorageAttachedIndex';

-- Text SAI index.
-- NOTE(review): valid SAI text options are case_sensitive / normalize /
-- ascii; 'similarity_function' is a vector (ANN) index option and is
-- rejected on a TEXT column.
CREATE CUSTOM INDEX IF NOT EXISTS products_description_sai 
ON products (description) 
USING 'StorageAttachedIndex' 
WITH OPTIONS = {'case_sensitive': 'false'};

-- SAI query examples
-- NOTE(review): LIKE pattern matching is a SASI feature; 4.1 SAI supports
-- equality/range predicates — confirm LIKE support against your version.
SELECT * FROM users WHERE first_name LIKE 'J%';

-- MAP type data search (entry lookup served by users_profile_sai)
SELECT * FROM users WHERE profile['location'] = 'Tokyo';

-- Range queries (served by users_created_at_sai)
SELECT * FROM users 
WHERE created_at >= '2024-01-01' AND created_at < '2024-02-01';

-- Composite conditions (SAI indexes are intersected)
SELECT * FROM users 
WHERE first_name = 'John' AND created_at >= '2024-01-01';

-- Text search (see LIKE caveat above)
SELECT * FROM products 
WHERE description LIKE '%gaming%';

-- Materialized View (server-maintained denormalized table)
-- NOTE(review): MVs are disabled by default in 4.x
-- (materialized_views_enabled in cassandra.yaml) — enable deliberately.
CREATE MATERIALIZED VIEW IF NOT EXISTS users_by_email AS
SELECT user_id, username, email, first_name, last_name, created_at
FROM users
WHERE email IS NOT NULL AND user_id IS NOT NULL
PRIMARY KEY (email, user_id);

-- Query using Materialized View
SELECT * FROM users_by_email WHERE email = '[email protected]';

-- Aggregation functions and grouping
-- GROUP BY is only allowed on partition-key (and leading clustering)
-- columns; category is the partition key here, so this is valid.
SELECT category, COUNT(*) as product_count
FROM products_by_category
WHERE category IN ('Electronics', 'Books', 'Clothing')
GROUP BY category;

Performance Monitoring and Optimization

# JMX metrics check
nodetool tpstats          # thread pool statistics (pending/blocked = saturation)
nodetool compactionstats  # Compaction statistics
nodetool proxyhistograms  # Latency histograms

# Table statistics
nodetool tablestats myapp.users
nodetool tablehistograms myapp users

# Node performance monitoring
nodetool gossipinfo
nodetool getcompactionthreshold myapp users

# Configuration changes (min/max SSTable compaction thresholds)
nodetool setcompactionthreshold myapp users 4 32
nodetool sethintedhandoffthrottlekb 1024

# Log monitoring
sudo tail -f /var/log/cassandra/system.log
sudo tail -f /var/log/cassandra/debug.log

# JVM GC statistics
nodetool gcstats

# Prometheus monitoring setup (JMX exporter)
cat > /opt/cassandra/conf/jmx_prometheus.yml << 'EOF'
jmxUrl: service:jmx:rmi:///jndi/rmi://localhost:7199/jmxrmi
ssl: false
lowercaseOutputName: false
lowercaseOutputLabelNames: false
whitelistObjectNames:
  - "org.apache.cassandra.metrics:type=ClientRequest,scope=Read,name=Latency"
  - "org.apache.cassandra.metrics:type=ClientRequest,scope=Write,name=Latency"
  - "org.apache.cassandra.metrics:type=Storage,name=Load"
  - "org.apache.cassandra.metrics:type=Storage,name=Exceptions"
rules:
  - pattern: 'org.apache.cassandra.metrics<type=(.+), name=(.+)><>Value'
    name: cassandra_$1_$2
    type: GAUGE
EOF

# The JMX exporter is a Java *agent*: it must be loaded into Cassandra's JVM
# (running it with `java -jar` does not work). Register it in the JVM options
# and restart; metrics are then served at http://localhost:8080/metrics
echo '-javaagent:/opt/cassandra/lib/jmx_prometheus_javaagent.jar=8080:/opt/cassandra/conf/jmx_prometheus.yml' | sudo tee -a /etc/cassandra/jvm-server.options
sudo systemctl restart cassandra

# Enable query tracing (run in cqlsh)
TRACING ON;
SELECT * FROM users WHERE username = 'john_doe';
TRACING OFF;

# Slow query logging
# NOTE(review): system_views tables are read-only virtual tables and cannot
# be ALTERed. Configure the threshold in cassandra.yaml instead:
#   slow_query_log_timeout_in_ms: 500
# Queries exceeding it are logged to debug.log.

Application Integration and Best Practices

# Python (cassandra-driver) usage example
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
from cassandra.policies import DCAwareRoundRobinPolicy, TokenAwarePolicy
from cassandra.query import SimpleStatement, PreparedStatement
import uuid
from datetime import datetime

class CassandraConnection:
    """Convenience wrapper around a Cassandra cluster connection.

    Connects with token-aware, datacenter-aware load balancing, binds the
    session to ``keyspace``, and pre-prepares the CQL statements used by the
    helper methods. Instances may also be used as a context manager so the
    cluster connection is always shut down.
    """

    def __init__(self, hosts=('127.0.0.1',), keyspace='myapp'):
        """Open the cluster connection.

        Args:
            hosts: Iterable of contact-point addresses. The default is an
                immutable tuple — the original mutable-list default was a
                shared-state hazard; callers passing lists are unaffected.
            keyspace: Keyspace the session is bound to.
        """
        # Production environment recommended settings.
        # NOTE(review): these are the driver's well-known default credentials —
        # replace them (or load from a secret store) in production.
        auth_provider = PlainTextAuthProvider(
            username='cassandra', 
            password='cassandra'
        )
        
        # Token awareness routes each request to a replica of its partition;
        # DC awareness keeps traffic inside the local datacenter.
        load_balancing_policy = TokenAwarePolicy(
            DCAwareRoundRobinPolicy(local_dc='datacenter1')
        )
        
        self.cluster = Cluster(
            hosts,
            auth_provider=auth_provider,
            load_balancing_policy=load_balancing_policy,
            port=9042,
            protocol_version=4
        )
        
        self.session = self.cluster.connect()
        self.session.set_keyspace(keyspace)
        
        # Prepared statements are parsed once server-side and reused —
        # faster than raw strings and safe against CQL injection.
        self.prepared_statements = self._prepare_statements()

    def __enter__(self):
        """Support ``with CassandraConnection(...) as db:`` usage."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Release cluster resources on scope exit; never suppress exceptions."""
        self.close()
        return False

    def _prepare_statements(self):
        """Prepare and cache every CQL statement used by this wrapper."""
        return {
            'insert_user': self.session.prepare("""
                INSERT INTO users (user_id, username, email, first_name, last_name, created_at, is_active)
                VALUES (?, ?, ?, ?, ?, ?, ?)
            """),
            'get_user_by_username': self.session.prepare("""
                SELECT * FROM users WHERE username = ?
            """),
            'insert_activity': self.session.prepare("""
                INSERT INTO user_activities (user_id, activity_date, activity_time, activity_type, details)
                VALUES (?, ?, ?, ?, ?)
            """),
            'get_activities': self.session.prepare("""
                SELECT * FROM user_activities 
                WHERE user_id = ? AND activity_date = ?
                ORDER BY activity_time DESC
                LIMIT ?
            """)
        }
    
    def create_user(self, username, email, first_name, last_name):
        """Insert a new user row and return its generated UUID."""
        user_id = uuid.uuid4()
        
        self.session.execute(
            self.prepared_statements['insert_user'],
            [user_id, username, email, first_name, last_name, datetime.now(), True]
        )
        
        return user_id
    
    def get_user_by_username(self, username):
        """Return the first row matching ``username``, or None.

        Relies on the secondary index on users.username.
        """
        result = self.session.execute(
            self.prepared_statements['get_user_by_username'],
            [username]
        )
        
        return result.one() if result else None
    
    def log_user_activity(self, user_id, activity_type, details=None):
        """Append one activity row to today's (user_id, date) partition."""
        from datetime import date
        
        self.session.execute(
            self.prepared_statements['insert_activity'],
            [user_id, date.today(), datetime.now(), activity_type, details or {}]
        )
    
    def get_user_activities(self, user_id, activity_date, limit=50):
        """Fetch up to ``limit`` newest-first activities from one user/day partition."""
        result = self.session.execute(
            self.prepared_statements['get_activities'],
            [user_id, activity_date, limit]
        )
        
        return list(result)
    
    def batch_operations(self, operations):
        """Execute (statement, parameters) pairs as a single logged batch.

        Logged batches give eventual atomicity across the writes, not a
        throughput win — keep batches small.
        """
        from cassandra.query import BatchStatement
        
        batch = BatchStatement()
        for statement, parameters in operations:
            batch.add(statement, parameters)
        
        self.session.execute(batch)
    
    def close(self):
        """Shut down the cluster connection and all sessions created from it."""
        self.cluster.shutdown()

# Usage example
def main():
    """Demonstration driver: exercises the CassandraConnection helpers end to end."""
    # Cassandra connection — contact points are three cluster nodes
    db = CassandraConnection(['192.168.1.10', '192.168.1.11', '192.168.1.12'])
    
    try:
        # User creation (returns the generated UUID)
        user_id = db.create_user(
            username='alice_cooper',
            email='[email protected]',
            first_name='Alice',
            last_name='Cooper'
        )
        print(f"Created user: {user_id}")
        
        # User retrieval via the indexed username column
        user = db.get_user_by_username('alice_cooper')
        print(f"Retrieved user: {user}")
        
        # Activity logging — details is a free-form MAP<TEXT, TEXT>
        db.log_user_activity(
            user_id=user_id,
            activity_type='login',
            details={'ip_address': '192.168.1.100', 'user_agent': 'Chrome/91.0'}
        )
        
        # Batch operation example: two activity inserts applied as one batch
        from datetime import date
        batch_ops = [
            (db.prepared_statements['insert_activity'], 
             [user_id, date.today(), datetime.now(), 'page_view', {'page': '/dashboard'}]),
            (db.prepared_statements['insert_activity'], 
             [user_id, date.today(), datetime.now(), 'click', {'element': 'nav-profile'}])
        ]
        db.batch_operations(batch_ops)
        
        # Activity retrieval from today's (user_id, date) partition
        activities = db.get_user_activities(user_id, date.today())
        print(f"User activities: {len(activities)} records")
        
    finally:
        # Always release the cluster connection, even on failure
        db.close()

if __name__ == "__main__":
    main()

# Spring Boot integration example (Java)
# NOTE(review): the triple-quoted string below is an illustrative Java /
# Spring Data Cassandra sample embedded as a Python string literal — it is
# documentation only, not executable Python.
"""
// application.yml
spring:
  cassandra:
    contact-points: 192.168.1.10,192.168.1.11,192.168.1.12
    port: 9042
    keyspace-name: myapp
    username: cassandra
    password: cassandra
    local-datacenter: datacenter1
    request:
      timeout: 10s
      consistency: LOCAL_QUORUM

// CassandraConfig.java
@Configuration
@EnableCassandraRepositories
public class CassandraConfig extends AbstractCassandraConfiguration {
    
    @Override
    protected String getKeyspaceName() {
        return "myapp";
    }
    
    @Override
    protected String getContactPoints() {
        return "192.168.1.10,192.168.1.11,192.168.1.12";
    }
    
    @Override
    protected int getPort() {
        return 9042;
    }
    
    @Override
    protected String getLocalDataCenter() {
        return "datacenter1";
    }
}

// User.java (Entity)
@Table("users")
public class User {
    @PrimaryKey
    private UUID userId;
    
    @Column("username")
    private String username;
    
    @Column("email") 
    private String email;
    
    @Column("first_name")
    private String firstName;
    
    @Column("last_name")
    private String lastName;
    
    @Column("created_at")
    private Instant createdAt;
    
    @Column("is_active")
    private Boolean isActive;
    
    // getters and setters
}

// UserRepository.java
@Repository
public interface UserRepository extends CassandraRepository<User, UUID> {
    @Query("SELECT * FROM users WHERE username = ?0")
    Optional<User> findByUsername(String username);
    
    @Query("SELECT * FROM users WHERE email = ?0")
    Optional<User> findByEmail(String email);
}
"""