Database

Amazon Redshift

Overview

Amazon Redshift is a fully managed cloud data warehouse service provided by AWS. It supports petabyte-scale data analytics and offers easy integration with existing SQL tools through PostgreSQL compatibility. Using a columnar storage architecture, it delivers high-speed analytical queries on large datasets.

Details

Amazon Redshift is a data warehouse specialized for OLAP (Online Analytical Processing) workloads and is based on PostgreSQL 8.0.2. Unlike traditional OLTP databases, it stores data in a columnar layout optimized for analytics and achieves high performance through data compression and massively parallel processing.
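
As a quick illustration of the columnar-storage and compression claims above, here is a minimal SQL sketch; it assumes the ecommerce.sales table defined later in the Code Examples section. An aggregate over two columns reads only those columns' blocks, and ANALYZE COMPRESSION samples the table to recommend a per-column encoding.

-- Columnar scan: only the sale_date and total_amount blocks are read
SELECT sale_date, SUM(total_amount) AS daily_revenue
FROM ecommerce.sales
GROUP BY sale_date;

-- Sample the table and report a recommended compression encoding for each column
ANALYZE COMPRESSION ecommerce.sales;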

Key features of Amazon Redshift:

  • PostgreSQL 8.0.2-based SQL compatibility
  • Columnar storage engine
  • Automatic compression and encoding optimization
  • Parallel distributed processing architecture
  • AWS managed service
  • Petabyte-scale scalability
  • Seamless Amazon S3 integration
  • Redshift Spectrum (data lake integration)
  • RA3 instances (compute and storage separation)
  • Zero-ETL integration (Aurora PostgreSQL connectivity)
  • Machine learning integration (SageMaker)
  • Automatic backup and point-in-time recovery

Advantages & Disadvantages

Advantages

  • Fully Managed: Focus on data analytics without infrastructure management
  • High-Speed Analytics: Fast query execution with columnar storage
  • PostgreSQL Compatible: Leverage existing SQL knowledge and tools
  • AWS Ecosystem: Integration with S3, Firehose, Lambda, etc.
  • Auto Scaling: Flexible scaling based on data volume
  • Cost Efficient: Pay-per-use and concurrency scaling
  • High Availability: Automatic backup and multi-AZ support
  • Security: Data encryption and VPC integration
  • Zero-ETL: Real-time connectivity with Aurora
  • Machine Learning: Integration with Amazon SageMaker

Disadvantages

  • OLTP Unsuitable: Not optimized for transaction processing
  • PostgreSQL Limitations: Based on older PostgreSQL 8.0.2 with feature limitations
  • Vendor Lock-in: AWS-specific service
  • Cost Complexity: Multiple billing components require understanding
  • Update Limitations: Frequent updates/deletes are inefficient
  • Concurrent Connection Limits: Limited support for large numbers of concurrent users
  • Data Load Time: Loading large datasets takes time

Key Links
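
  • Amazon Redshift product page: https://aws.amazon.com/redshift/
  • Amazon Redshift documentation: https://docs.aws.amazon.com/redshift/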

Code Examples

Cluster Creation & Setup

# Create Redshift cluster with AWS CLI
aws redshift create-cluster \
  --cluster-identifier my-redshift-cluster \
  --node-type dc2.large \
  --master-username admin \
  --master-user-password MyPassword123 \
  --db-name analytics \
  --cluster-type single-node \
  --publicly-accessible

# Check cluster status
aws redshift describe-clusters \
  --cluster-identifier my-redshift-cluster

# Create Redshift Serverless workgroup
aws redshift-serverless create-workgroup \
  --workgroup-name my-workgroup \
  --namespace-name default \
  --base-capacity 8 \
  --enhanced-vpc-routing

# Connect with psql client
export PGHOST=my-redshift-cluster.abcdefg.us-east-1.redshift.amazonaws.com
export PGPORT=5439
export PGDATABASE=analytics
export PGUSER=admin
export PGPASSWORD=MyPassword123

psql

# JDBC connection string
jdbc:redshift://my-redshift-cluster.abcdefg.us-east-1.redshift.amazonaws.com:5439/analytics

Basic Operations (DDL/DML)

-- Create database and schema
CREATE DATABASE sales_analytics;
CREATE SCHEMA ecommerce;

-- Create table with compression encoding
CREATE TABLE ecommerce.customer (
    customer_id INTEGER ENCODE delta,
    first_name VARCHAR(50) ENCODE lzo,
    last_name VARCHAR(50) ENCODE lzo,
    email VARCHAR(100) ENCODE lzo,
    phone VARCHAR(20) ENCODE bytedict,
    country_code CHAR(2) ENCODE raw,
    signup_date DATE ENCODE delta32k,
    lifetime_value DECIMAL(10,2) ENCODE az64,
    last_login TIMESTAMP ENCODE az64
)
DISTSTYLE KEY
DISTKEY (customer_id)
SORTKEY (signup_date);

-- Sales data table
CREATE TABLE ecommerce.sales (
    sale_id BIGINT IDENTITY(1,1),
    customer_id INTEGER,
    product_id INTEGER,
    sale_date DATE,
    quantity INTEGER,
    unit_price DECIMAL(10,2),
    total_amount DECIMAL(12,2),
    discount_percent DECIMAL(3,2)
)
DISTSTYLE KEY
DISTKEY (customer_id)
SORTKEY (sale_date);

-- External table (Redshift Spectrum)
CREATE EXTERNAL SCHEMA ecommerce_s3
FROM DATA CATALOG
DATABASE 'sales_catalog'
IAM_ROLE 'arn:aws:iam::123456789012:role/RedshiftSpectrumRole'
CREATE EXTERNAL DATABASE IF NOT EXISTS;

CREATE EXTERNAL TABLE ecommerce_s3.web_logs (
    log_date DATE,
    user_id INTEGER,
    session_id VARCHAR(50),
    page_url VARCHAR(500),
    user_agent VARCHAR(500),
    ip_address VARCHAR(15)
)
STORED AS PARQUET
LOCATION 's3://my-bucket/web-logs/'
TABLE PROPERTIES ('has_encrypted_data'='false');
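
Once the external schema and table above exist, the S3 data can be queried through Redshift Spectrum with ordinary SQL and joined to local tables. A minimal sketch using the example tables defined in this section:

-- Join S3-resident web logs (scanned by Spectrum) with the local customer table
SELECT 
    w.log_date,
    COUNT(*) AS page_views,
    COUNT(DISTINCT w.user_id) AS unique_visitors
FROM ecommerce_s3.web_logs w
JOIN ecommerce.customer c ON w.user_id = c.customer_id
WHERE w.log_date >= '2024-01-01'
GROUP BY w.log_date
ORDER BY w.log_date;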

Data Loading & COPY

-- Load data from S3 (with automatic compression)
COPY ecommerce.customer 
FROM 's3://my-bucket/customer-data/'
IAM_ROLE 'arn:aws:iam::123456789012:role/RedshiftRole'
CSV
IGNOREHEADER 1
COMPUPDATE ON
STATUPDATE ON;

-- Load JSON format data
COPY ecommerce.customer_events
FROM 's3://my-bucket/events/2024/01/'
IAM_ROLE 'arn:aws:iam::123456789012:role/RedshiftRole'
JSON 'auto'
TIMEFORMAT 'YYYY-MM-DDTHH:MI:SS'
GZIP;

-- Load from compressed files
COPY ecommerce.sales
FROM 's3://my-bucket/sales/sales_2024.csv.gz'
IAM_ROLE 'arn:aws:iam::123456789012:role/RedshiftRole'
CSV
DELIMITER ','
IGNOREHEADER 1
GZIP;

-- Parallel loading: a key prefix loads all matching files in parallel;
-- the MANIFEST option instead requires FROM to point at a manifest file listing the objects
COPY ecommerce.sales
FROM 's3://my-bucket/sales/2024/manifest.json'
IAM_ROLE 'arn:aws:iam::123456789012:role/RedshiftRole'
CSV
MANIFEST;

-- Validate files without loading them (NOLOAD dry run, tolerating up to 100 parse errors)
COPY ecommerce.customer
FROM 's3://my-bucket/customer-data/'
IAM_ROLE 'arn:aws:iam::123456789012:role/RedshiftRole'
CSV
MAXERROR 100
NOLOAD;

-- Check load statistics
SELECT query, filename, line_number, colname, type, position, raw_field_value, err_reason
FROM stl_load_errors
WHERE query = pg_last_copy_id()
ORDER BY query DESC;
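
The counterpart to COPY is UNLOAD, which exports query results back to S3. A minimal sketch using the same placeholder bucket and IAM role as the examples above:

-- Export 2024 sales to S3 as Parquet, written in parallel across slices
UNLOAD ('SELECT * FROM ecommerce.sales WHERE sale_date >= ''2024-01-01''')
TO 's3://my-bucket/exports/sales_2024_'
IAM_ROLE 'arn:aws:iam::123456789012:role/RedshiftRole'
FORMAT AS PARQUET
PARALLEL ON;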

Queries & Analytics

-- Basic aggregation query
SELECT 
    DATE_TRUNC('month', sale_date) AS month,
    COUNT(*) AS total_sales,
    COUNT(DISTINCT customer_id) AS unique_customers,
    SUM(total_amount) AS revenue,
    AVG(total_amount) AS avg_order_value,
    PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY total_amount) AS median_order_value
FROM ecommerce.sales
WHERE sale_date >= '2024-01-01'
GROUP BY 1
ORDER BY 1;

-- Window function analysis
SELECT 
    customer_id,
    sale_date,
    total_amount,
    ROW_NUMBER() OVER (PARTITION BY customer_id ORDER BY sale_date) AS order_sequence,
    LAG(total_amount) OVER (PARTITION BY customer_id ORDER BY sale_date) AS prev_order_amount,
    SUM(total_amount) OVER (PARTITION BY customer_id ORDER BY sale_date 
                           ROWS UNBOUNDED PRECEDING) AS cumulative_spending
FROM ecommerce.sales
WHERE customer_id IN (SELECT TOP 100 customer_id FROM ecommerce.customer);

-- Customer segmentation analysis
WITH customer_metrics AS (
    SELECT 
        c.customer_id,
        c.signup_date,
        COUNT(s.sale_id) AS order_count,
        SUM(s.total_amount) AS total_spent,
        MAX(s.sale_date) AS last_order_date,
        DATEDIFF(day, c.signup_date, MAX(s.sale_date)) AS customer_lifespan_days
    FROM ecommerce.customer c
    LEFT JOIN ecommerce.sales s ON c.customer_id = s.customer_id
    GROUP BY c.customer_id, c.signup_date
)
SELECT 
    CASE 
        WHEN total_spent >= 1000 AND order_count >= 10 THEN 'VIP'
        WHEN total_spent >= 500 AND order_count >= 5 THEN 'High Value'
        WHEN total_spent >= 100 AND order_count >= 2 THEN 'Regular'
        ELSE 'Low Value'
    END AS customer_segment,
    COUNT(*) AS customer_count,
    AVG(total_spent) AS avg_spending,
    AVG(order_count) AS avg_orders
FROM customer_metrics
GROUP BY 1
ORDER BY avg_spending DESC;

-- Time series analysis (moving average)
SELECT 
    sale_date,
    SUM(total_amount) AS daily_revenue,
    AVG(SUM(total_amount)) OVER (ORDER BY sale_date 
                                ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS weekly_avg,
    AVG(SUM(total_amount)) OVER (ORDER BY sale_date 
                                ROWS BETWEEN 29 PRECEDING AND CURRENT ROW) AS monthly_avg
FROM ecommerce.sales
WHERE sale_date >= '2024-01-01'
GROUP BY sale_date
ORDER BY sale_date;

Advanced Analytics Features

-- Machine learning with Redshift ML
CREATE MODEL customer_lifetime_value_model
FROM (
    SELECT 
        EXTRACT(year FROM c.signup_date) AS signup_year,
        EXTRACT(month FROM c.signup_date) AS signup_month,
        COUNT(s.sale_id) AS order_count,
        AVG(s.total_amount) AS avg_order_value,
        SUM(s.total_amount) AS total_spent
    FROM ecommerce.customer c
    JOIN ecommerce.sales s ON c.customer_id = s.customer_id
    WHERE c.signup_date >= '2023-01-01'
    GROUP BY c.customer_id, c.signup_date
)
TARGET total_spent
FUNCTION ml_fn_customer_ltv
IAM_ROLE 'arn:aws:iam::123456789012:role/RedshiftMLRole'
SETTINGS (
    S3_BUCKET 'my-ml-bucket',
    MAX_RUNTIME 5400
);

-- Prediction query (inputs must match the training feature columns, in order)
SELECT 
    customer_id,
    ml_fn_customer_ltv(signup_year, signup_month, order_count, avg_order_value) AS predicted_ltv
FROM (
    SELECT 
        c.customer_id,
        EXTRACT(year FROM c.signup_date) AS signup_year,
        EXTRACT(month FROM c.signup_date) AS signup_month,
        COUNT(s.sale_id) AS order_count,
        AVG(s.total_amount) AS avg_order_value
    FROM ecommerce.customer c
    LEFT JOIN ecommerce.sales s ON c.customer_id = s.customer_id
    WHERE c.signup_date >= '2024-01-01'
    GROUP BY c.customer_id, c.signup_date
) AS customer_features
ORDER BY predicted_ltv DESC
LIMIT 100;

-- Spatial data analysis (Redshift spatial functions)
SELECT 
    customer_id,
    ST_DistanceSphere(
        ST_Point(longitude, latitude),
        ST_Point(-73.935242, 40.730610)  -- NYC coordinates
    ) / 1000 AS distance_from_nyc_km
FROM ecommerce.customer_locations
WHERE longitude IS NOT NULL AND latitude IS NOT NULL;

-- Array operations and JSON processing
SELECT 
    customer_id,
    JSON_EXTRACT_PATH_TEXT(preferences, 'favorite_categories') AS favorite_categories,
    JSON_EXTRACT_ARRAY_ELEMENT_TEXT(
        JSON_EXTRACT_PATH_TEXT(preferences, 'favorite_categories'), 0
    ) AS top_category
FROM ecommerce.customer_preferences
WHERE preferences IS NOT NULL;

Performance Optimization

-- Update table statistics
ANALYZE ecommerce.sales;
ANALYZE ecommerce.customer;

-- VACUUM for space reclamation
VACUUM DELETE ONLY ecommerce.sales;
VACUUM REINDEX ecommerce.customer;

-- Compression encoding analysis (pg_table_def only shows schemas on the search_path)
SET search_path TO ecommerce;

SELECT 
    tablename, 
    "column", 
    type, 
    encoding,
    distkey,
    sortkey,
    "notnull"
FROM pg_table_def 
WHERE schemaname = 'ecommerce'
ORDER BY tablename, "column";

-- Check disk usage per table (size is reported in 1 MB blocks)
SELECT 
    "schema" AS table_schema,
    "table" AS table_name,
    size AS size_in_mb,
    tbl_rows AS row_count,
    size::float * 1024 * 1024 / NULLIF(tbl_rows, 0) AS avg_row_size_bytes
FROM SVV_TABLE_INFO
WHERE "schema" = 'ecommerce'
ORDER BY size DESC;

-- Query execution plan
EXPLAIN
SELECT 
    c.customer_id,
    SUM(s.total_amount) AS total_spent
FROM ecommerce.customer c
JOIN ecommerce.sales s ON c.customer_id = s.customer_id
WHERE c.signup_date >= '2024-01-01'
GROUP BY c.customer_id
ORDER BY total_spent DESC
LIMIT 100;

-- Create materialized view
CREATE MATERIALIZED VIEW ecommerce.monthly_sales_summary AS
SELECT 
    DATE_TRUNC('month', sale_date) AS month,
    customer_id,
    COUNT(*) AS order_count,
    SUM(total_amount) AS total_spent,
    AVG(total_amount) AS avg_order_value
FROM ecommerce.sales
GROUP BY 1, 2;

-- Auto refresh setting
ALTER MATERIALIZED VIEW ecommerce.monthly_sales_summary AUTO REFRESH YES;
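
Distribution and sort keys can also be changed in place when the query pattern shifts, or delegated to automatic table optimization. A hedged sketch against the example sales table:

-- Change the distribution and sort keys of an existing table
ALTER TABLE ecommerce.sales ALTER DISTSTYLE KEY DISTKEY product_id;
ALTER TABLE ecommerce.sales ALTER SORTKEY (sale_date, customer_id);

-- Or let automatic table optimization pick them
ALTER TABLE ecommerce.sales ALTER DISTSTYLE AUTO;
ALTER TABLE ecommerce.sales ALTER SORTKEY AUTO;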

Data Integration & ETL

-- Aurora PostgreSQL Zero-ETL integration
-- Integration setup (execute via AWS Console/CLI)
-- aws rds create-integration \
--   --integration-name aurora-to-redshift \
--   --source-arn arn:aws:rds:us-east-1:123456789012:cluster:aurora-postgres-cluster \
--   --target-arn arn:aws:redshift:us-east-1:123456789012:cluster:redshift-cluster

-- Check Zero-ETL integration data (the destination database and schema names
-- depend on how the integration was configured)
SELECT schemaname, tablename, tableowner
FROM pg_tables
WHERE schemaname NOT IN ('pg_catalog', 'information_schema')
ORDER BY schemaname, tablename;

-- Federated Query (live queries against an external PostgreSQL database;
-- credentials are read from AWS Secrets Manager)
CREATE EXTERNAL SCHEMA postgres_federation
FROM POSTGRES
DATABASE 'production_db'
SCHEMA 'public'
URI 'postgres-instance.abcdefg.us-east-1.rds.amazonaws.com'
PORT 5432
IAM_ROLE 'arn:aws:iam::123456789012:role/RedshiftFederatedRole'
SECRET_ARN 'arn:aws:secretsmanager:us-east-1:123456789012:secret:federated-credentials';

-- Execute federated query
SELECT 
    r.customer_id,
    r.total_spent,
    p.recent_activity
FROM ecommerce.customer_summary r
JOIN postgres_federation.user_activity p ON r.customer_id = p.user_id
WHERE r.total_spent > 1000;

# Data API for external access (Python example)
import time
import boto3

redshift_data = boto3.client('redshift-data')

# Submit the statement (the Data API is asynchronous)
response = redshift_data.execute_statement(
    ClusterIdentifier='my-redshift-cluster',
    Database='analytics',
    DbUser='admin',
    Sql='SELECT COUNT(*) FROM ecommerce.sales WHERE sale_date = CURRENT_DATE'
)

# Wait for the statement to finish, then fetch the results
statement_id = response['Id']
while redshift_data.describe_statement(Id=statement_id)['Status'] not in ('FINISHED', 'FAILED', 'ABORTED'):
    time.sleep(1)

result = redshift_data.get_statement_result(Id=statement_id)

Practical Examples

-- Real-time dashboard query
WITH daily_metrics AS (
    SELECT 
        CURRENT_DATE AS metric_date,
        COUNT(*) AS todays_orders,
        SUM(total_amount) AS todays_revenue,
        COUNT(DISTINCT customer_id) AS active_customers,
        AVG(total_amount) AS avg_order_value
    FROM ecommerce.sales
    WHERE sale_date = CURRENT_DATE
),
comparison_metrics AS (
    SELECT 
        COUNT(*) AS yesterday_orders,
        SUM(total_amount) AS yesterday_revenue
    FROM ecommerce.sales
    WHERE sale_date = CURRENT_DATE - 1
)
SELECT 
    d.*,
    c.yesterday_orders,
    c.yesterday_revenue,
    ROUND(((d.todays_revenue - c.yesterday_revenue) / NULLIF(c.yesterday_revenue, 0)) * 100, 2) AS revenue_growth_pct,
    ROUND(((d.todays_orders - c.yesterday_orders)::decimal / NULLIF(c.yesterday_orders, 0)) * 100, 2) AS order_growth_pct
FROM daily_metrics d
CROSS JOIN comparison_metrics c;

-- Product recommendation system (collaborative filtering)
WITH user_product_matrix AS (
    SELECT 
        customer_id,
        product_id,
        SUM(quantity) AS total_purchased
    FROM ecommerce.sales
    GROUP BY customer_id, product_id
),
similar_customers AS (
    SELECT 
        u1.customer_id AS customer_a,
        u2.customer_id AS customer_b,
        COUNT(*) AS common_products,
        CORR(u1.total_purchased, u2.total_purchased) AS similarity_score
    FROM user_product_matrix u1
    JOIN user_product_matrix u2 ON u1.product_id = u2.product_id 
                                AND u1.customer_id != u2.customer_id
    GROUP BY u1.customer_id, u2.customer_id
    HAVING COUNT(*) >= 3
)
SELECT 
    sc.customer_a AS target_customer,
    upm.product_id AS recommended_product,
    AVG(sc.similarity_score) AS recommendation_score
FROM similar_customers sc
JOIN user_product_matrix upm ON sc.customer_b = upm.customer_id
LEFT JOIN user_product_matrix existing ON sc.customer_a = existing.customer_id 
                                       AND upm.product_id = existing.product_id
WHERE existing.product_id IS NULL  -- Unpurchased products only
  AND sc.similarity_score > 0.7
GROUP BY sc.customer_a, upm.product_id
ORDER BY sc.customer_a, recommendation_score DESC;

-- Anomaly detection (sales spike detection)
WITH daily_sales AS (
    SELECT 
        sale_date,
        SUM(total_amount) AS daily_revenue
    FROM ecommerce.sales
    WHERE sale_date >= CURRENT_DATE - 90
    GROUP BY sale_date
),
sales_with_stats AS (
    SELECT 
        sale_date,
        daily_revenue,
        AVG(daily_revenue) OVER (ORDER BY sale_date ROWS BETWEEN 13 PRECEDING AND 1 PRECEDING) AS avg_14day,
        STDDEV(daily_revenue) OVER (ORDER BY sale_date ROWS BETWEEN 13 PRECEDING AND 1 PRECEDING) AS stddev_14day
    FROM daily_sales
)
SELECT 
    sale_date,
    daily_revenue,
    avg_14day,
    ROUND((daily_revenue - avg_14day) / stddev_14day, 2) AS z_score,
    CASE 
        WHEN ABS((daily_revenue - avg_14day) / stddev_14day) > 2.5 THEN 'ANOMALY'
        WHEN ABS((daily_revenue - avg_14day) / stddev_14day) > 1.5 THEN 'WARNING'
        ELSE 'NORMAL'
    END AS status
FROM sales_with_stats
WHERE stddev_14day > 0
  AND sale_date >= CURRENT_DATE - 30
ORDER BY sale_date DESC;

System Management & Monitoring

-- Check cluster disk usage per node and disk (used/capacity are in 1 MB blocks)
SELECT 
    owner AS node,
    host,
    diskno,
    used AS used_mb,
    capacity AS capacity_mb,
    used::float / capacity::float * 100 AS disk_usage_pct
FROM stv_partitions
ORDER BY disk_usage_pct DESC;

-- Monitor running queries
SELECT 
    pid,
    userid,
    starttime,
    duration / 1000000 AS elapsed_seconds,
    TRIM(SUBSTRING(query, 1, 100)) AS query_text
FROM stv_recents
WHERE status = 'Running'
ORDER BY duration DESC;

-- Check long-running queries completed in the last hour
SELECT 
    query,
    userid,
    starttime,
    DATEDIFF(seconds, starttime, endtime) AS runtime_seconds,
    SUBSTRING(querytxt, 1, 200) AS query_text
FROM stl_query
WHERE starttime >= DATEADD(hour, -1, GETDATE())
  AND DATEDIFF(seconds, starttime, endtime) > 300
ORDER BY runtime_seconds DESC;

-- Check table locks
SELECT 
    table_id,
    last_update,
    lock_owner,
    lock_owner_pid,
    lock_status
FROM stv_locks
ORDER BY last_update;

-- Check data distribution across slices (per-slice row counts from stv_tbl_perm)
SELECT 
    slice,
    SUM(rows) AS row_count
FROM stv_tbl_perm
WHERE name = 'sales'
GROUP BY slice
ORDER BY slice;

-- Workload Management (WLM) queue check
SELECT 
    service_class,
    query,
    slot_count,
    total_queue_time / 1000000 AS queue_seconds,
    total_exec_time / 1000000 AS exec_seconds
FROM stl_wlm_query
WHERE service_class > 4
  AND queue_start_time >= DATEADD(hour, -1, GETDATE())
ORDER BY total_queue_time DESC;

-- Check snapshots and backups (snapshots are managed through the AWS API/CLI, not SQL)
-- aws redshift describe-cluster-snapshots \
--   --cluster-identifier my-redshift-cluster \
--   --max-items 10
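
Finally, the query planner writes tuning hints (missing statistics, nested loop joins, very large broadcasts, and so on) to STL_ALERT_EVENT_LOG, and reviewing it is a quick way to spot problem queries. A minimal sketch:

-- Review recent query planner alerts and the suggested fixes
SELECT 
    query,
    TRIM(event) AS event,
    TRIM(solution) AS solution,
    event_time
FROM stl_alert_event_log
WHERE event_time >= DATEADD(day, -1, GETDATE())
ORDER BY event_time DESC
LIMIT 20;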