Snowflake
Overview
Snowflake is a cloud-native data warehouse delivered as a fully managed service. Its architecture separates storage from compute, so each layer can scale and be billed independently, and the platform runs on all three major cloud providers: AWS, Azure, and GCP.
Details
Founded in 2012, Snowflake was designed to solve the scaling and operational challenges of traditional data warehouses. Its multi-cluster, shared-data architecture separates storage, compute, and cloud services into independent layers that scale on their own. The platform emphasizes data sharing, security, and governance while supporting enterprise-scale analytics workloads.
Key Snowflake features:
- Storage and compute separation architecture
- Multi-cluster virtual warehouses
- Automatic scaling and optimization
- Standard SQL support
- ACID compliant transactions
- Time Travel (point-in-time queries and restore)
- Secure data sharing
- Zero-copy cloning
- Multi-cloud support
- Fully managed service
Pros and Cons
Pros
- Cost Efficiency: Pay-per-use billing with automatic suspension
- Scalability: Independent storage and compute scaling
- High Performance: Automatic optimization and materialized views
- Ease of Use: Fully managed, no infrastructure management
- Data Sharing: Secure data sharing and marketplace
- Multi-Cloud: AWS, Azure, GCP support
- Security: Encryption, access control, audit features
Cons
- Vendor Lock-in: Dependency on Snowflake-specific features
- Cost Prediction: Usage-based billing complicates budget management
- Real-time Limitations: Batch-oriented; not suited to low-latency streaming analytics
- Customization Limits: Limited infrastructure configuration freedom
- Data Migration: High initial cost for large data migration
Code Examples
Installation and Setup
# Connect to Snowflake (Web UI)
# https://[account_identifier].snowflakecomputing.com/
# Install SnowSQL CLI (macOS)
brew install snowflake-snowsql
# Install SnowSQL CLI (Windows)
# Download from https://developers.snowflake.com/snowsql/
# SnowSQL config file (~/.snowsql/config)
[connections]
accountname = your_account_identifier
username = your_username
password = your_password
dbname = your_default_database
schemaname = your_default_schema
warehousename = your_default_warehouse
# Connect with SnowSQL
snowsql -a your_account_identifier -u your_username
# Python connector
pip install snowflake-connector-python
# JDBC driver (Maven Central)
# https://repo1.maven.org/maven2/net/snowflake/snowflake-jdbc/
Basic Operations (DDL/DML)
-- Create database
CREATE DATABASE analytics_db;
USE DATABASE analytics_db;
-- Create schema
CREATE SCHEMA sales_data;
USE SCHEMA sales_data;
-- Create warehouse
CREATE WAREHOUSE compute_wh
WITH WAREHOUSE_SIZE = 'MEDIUM'
AUTO_SUSPEND = 60 -- Auto-suspend after 1 minute
AUTO_RESUME = TRUE
INITIALLY_SUSPENDED = TRUE;
-- Use warehouse
USE WAREHOUSE compute_wh;
-- Create table
CREATE TABLE sales_transactions (
transaction_id VARCHAR(50),
customer_id VARCHAR(50),
product_id VARCHAR(50),
transaction_date DATE,
transaction_timestamp TIMESTAMP_NTZ,
amount DECIMAL(10,2),
quantity INTEGER,
category VARCHAR(100),
region VARCHAR(50),
payment_method VARCHAR(50)
);
-- Insert data
INSERT INTO sales_transactions VALUES
('T001', 'C001', 'P001', '2024-01-15', '2024-01-15 10:30:00', 299.99, 2, 'Electronics', 'North', 'Credit Card'),
('T002', 'C002', 'P002', '2024-01-15', '2024-01-15 11:45:00', 49.99, 1, 'Books', 'South', 'PayPal'),
('T003', 'C001', 'P003', '2024-01-16', '2024-01-16 09:15:00', 149.99, 1, 'Clothing', 'North', 'Credit Card');
-- Bulk load from CSV file: create an internal stage
CREATE STAGE my_stage
FILE_FORMAT = (TYPE = 'CSV' FIELD_DELIMITER = ',' SKIP_HEADER = 1);
-- Upload a local file from SnowSQL, then load it from the stage
-- (loading directly from S3 requires an external stage with a URL and credentials)
-- PUT file:///tmp/sales_data.csv @my_stage;
COPY INTO sales_transactions
FROM @my_stage/sales_data.csv
FILE_FORMAT = (TYPE = 'CSV' FIELD_DELIMITER = ',' SKIP_HEADER = 1);
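-- A COPY statement can be dry-run first to surface parse errors without loading
-- any rows (a minimal sketch, reusing the stage and file name from above):
COPY INTO sales_transactions
FROM @my_stage/sales_data.csv
FILE_FORMAT = (TYPE = 'CSV' FIELD_DELIMITER = ',' SKIP_HEADER = 1)
VALIDATION_MODE = RETURN_ERRORS;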
-- Update data
UPDATE sales_transactions
SET amount = amount * 1.1
WHERE category = 'Electronics';
-- Delete data
DELETE FROM sales_transactions
WHERE transaction_date < '2024-01-01';
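-- Snowflake also supports standard MERGE for upserts. A minimal sketch, assuming
-- a hypothetical staging table new_transactions with the same columns:
MERGE INTO sales_transactions t
USING new_transactions s
ON t.transaction_id = s.transaction_id
WHEN MATCHED THEN UPDATE SET t.amount = s.amount, t.quantity = s.quantity
WHEN NOT MATCHED THEN INSERT VALUES (
s.transaction_id, s.customer_id, s.product_id, s.transaction_date,
s.transaction_timestamp, s.amount, s.quantity, s.category, s.region, s.payment_method
);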
Query and Analysis
-- Basic aggregation query
SELECT
region,
category,
COUNT(*) AS transaction_count,
SUM(amount) AS total_revenue,
AVG(amount) AS avg_transaction_value,
SUM(quantity) AS total_quantity
FROM sales_transactions
WHERE transaction_date >= '2024-01-01'
GROUP BY region, category
ORDER BY total_revenue DESC;
-- Time series analysis
SELECT
DATE_TRUNC('month', transaction_date) AS month,
SUM(amount) AS monthly_revenue,
COUNT(DISTINCT customer_id) AS unique_customers,
AVG(amount) AS avg_order_value,
LAG(SUM(amount)) OVER (ORDER BY month) AS prev_month_revenue,
(SUM(amount) - LAG(SUM(amount)) OVER (ORDER BY month)) /
LAG(SUM(amount)) OVER (ORDER BY month) * 100 AS growth_rate
FROM sales_transactions
GROUP BY month
ORDER BY month;
-- Customer analysis with window functions
SELECT
customer_id,
transaction_date,
amount,
SUM(amount) OVER (PARTITION BY customer_id ORDER BY transaction_date) AS running_total,
ROW_NUMBER() OVER (PARTITION BY customer_id ORDER BY transaction_date) AS order_sequence,
DENSE_RANK() OVER (ORDER BY amount DESC) AS amount_rank
FROM sales_transactions
ORDER BY customer_id, transaction_date;
-- JSON data processing
CREATE TABLE user_events (
event_id VARCHAR(50),
user_id VARCHAR(50),
event_timestamp TIMESTAMP_NTZ,
event_data VARIANT -- JSON type
);
-- VARIANT values must be built with PARSE_JSON, so load via INSERT ... SELECT
-- (string literals in a plain VALUES clause do not coerce to VARIANT)
INSERT INTO user_events
SELECT 'E001', 'U001', '2024-01-15 10:30:00',
PARSE_JSON('{"action": "page_view", "page": "/products", "duration": 120, "referrer": "google.com"}')
UNION ALL
SELECT 'E002', 'U001', '2024-01-15 10:32:00',
PARSE_JSON('{"action": "add_to_cart", "product_id": "P001", "quantity": 2, "price": 299.99}');
SELECT
user_id,
event_data:action::STRING AS action,
event_data:page::STRING AS page,
event_data:duration::INTEGER AS duration,
event_data:product_id::STRING AS product_id
FROM user_events
WHERE event_data:action::STRING = 'page_view';
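-- When a VARIANT column holds a JSON array, LATERAL FLATTEN expands each element
-- into its own row. A minimal sketch, assuming a hypothetical "items" array key:
SELECT
e.user_id,
item.value:product_id::STRING AS product_id,
item.value:qty::INTEGER AS qty
FROM user_events e,
LATERAL FLATTEN(input => e.event_data:items) item;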
Advanced Features
-- Time Travel (historical data access)
-- Data from 1 hour ago
SELECT * FROM sales_transactions AT (TIMESTAMP => DATEADD(hour, -1, CURRENT_TIMESTAMP()));
-- Data as it was immediately before a specific statement ran (by query ID)
SELECT * FROM sales_transactions BEFORE (STATEMENT => '01a2b3c4-0001-2345-0000-000012345678');
-- Table cloning (Zero-copy)
CREATE TABLE sales_transactions_backup
CLONE sales_transactions;
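-- Cloning composes with Time Travel, which makes point-in-time restores cheap;
-- UNDROP recovers a dropped table within the retention window:
CREATE TABLE sales_transactions_restored
CLONE sales_transactions AT (OFFSET => -3600); -- as of 1 hour ago
UNDROP TABLE sales_transactions_backup;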
-- Tasks (scheduled execution)
CREATE TASK daily_summary_task
WAREHOUSE = compute_wh
SCHEDULE = 'USING CRON 0 2 * * * UTC' -- Daily at 2 AM UTC
AS
INSERT INTO daily_sales_summary
SELECT
CURRENT_DATE() - 1 AS report_date,
region,
SUM(amount) AS daily_revenue,
COUNT(*) AS transaction_count
FROM sales_transactions
WHERE transaction_date = CURRENT_DATE() - 1
GROUP BY region;
-- Start task
ALTER TASK daily_summary_task RESUME;
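-- Verify task runs with the TASK_HISTORY table function
SELECT name, state, scheduled_time, error_message
FROM TABLE(INFORMATION_SCHEMA.TASK_HISTORY())
WHERE name = 'DAILY_SUMMARY_TASK'
ORDER BY scheduled_time DESC;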
-- Streams (change data capture)
CREATE STREAM sales_stream ON TABLE sales_transactions;
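-- A stream records row changes since it was last consumed; reading it inside a
-- DML statement advances its offset. Sketch assumes a hypothetical sales_audit table:
SELECT * FROM sales_stream; -- shows METADATA$ACTION / METADATA$ISUPDATE columns
INSERT INTO sales_audit
SELECT transaction_id, amount, METADATA$ACTION
FROM sales_stream; -- consuming the stream resets it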
-- Pipes (continuous loading with Snowpipe)
-- Note: AUTO_INGEST requires an external stage with cloud event notifications (e.g. S3 + SQS)
CREATE PIPE sales_pipe
AUTO_INGEST = TRUE
AS
COPY INTO sales_transactions
FROM @my_stage
FILE_FORMAT = (TYPE = 'CSV');
-- Secure views
CREATE SECURE VIEW sensitive_sales AS
SELECT
region,
category,
SUM(amount) AS revenue,
COUNT(*) AS transactions
FROM sales_transactions
GROUP BY region, category;
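-- Secure views pair with data sharing: a share exposes read-only objects to other
-- accounts without copying data. A minimal sketch (consumer account is a placeholder):
CREATE SHARE sales_share;
GRANT USAGE ON DATABASE analytics_db TO SHARE sales_share;
GRANT USAGE ON SCHEMA analytics_db.sales_data TO SHARE sales_share;
GRANT SELECT ON VIEW sensitive_sales TO SHARE sales_share;
ALTER SHARE sales_share ADD ACCOUNTS = my_org.consumer_account;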
Performance Optimization
-- Set clustering key
ALTER TABLE sales_transactions
CLUSTER BY (transaction_date, region);
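-- Inspect clustering quality (depth/overlap statistics) for the chosen keys
SELECT SYSTEM$CLUSTERING_INFORMATION('sales_transactions', '(transaction_date, region)');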
-- Materialized view
CREATE MATERIALIZED VIEW sales_summary_mv AS
SELECT
DATE_TRUNC('day', transaction_date) AS day,
region,
category,
SUM(amount) AS daily_revenue,
COUNT(*) AS transaction_count,
AVG(amount) AS avg_amount
FROM sales_transactions
GROUP BY day, region, category;
-- Enable automatic clustering
ALTER TABLE sales_transactions
RESUME RECLUSTER;
-- Search optimization service
ALTER TABLE sales_transactions
ADD SEARCH OPTIMIZATION;
-- Result cache utilization (automatic)
-- Same queries are cached for 24 hours
SELECT COUNT(*) FROM sales_transactions
WHERE transaction_date = CURRENT_DATE();
-- Query Acceleration Service
ALTER WAREHOUSE compute_wh
SET ENABLE_QUERY_ACCELERATION = TRUE
QUERY_ACCELERATION_MAX_SCALE_FACTOR = 8;
-- Adjust warehouse size
ALTER WAREHOUSE compute_wh SET WAREHOUSE_SIZE = 'LARGE';
-- Multi-cluster configuration (Enterprise+)
ALTER WAREHOUSE compute_wh SET
MIN_CLUSTER_COUNT = 1
MAX_CLUSTER_COUNT = 5
SCALING_POLICY = 'STANDARD';
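-- Preview a query plan without consuming warehouse compute
EXPLAIN
SELECT region, SUM(amount) AS revenue
FROM sales_transactions
GROUP BY region;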
Practical Examples
-- Customer segmentation analysis
WITH customer_metrics AS (
SELECT
customer_id,
COUNT(*) AS purchase_frequency,
SUM(amount) AS total_spent,
AVG(amount) AS avg_order_value,
MAX(transaction_date) AS last_purchase_date,
DATEDIFF(day, MAX(transaction_date), CURRENT_DATE()) AS days_since_last_purchase
FROM sales_transactions
GROUP BY customer_id
),
customer_segments AS (
SELECT
customer_id,
purchase_frequency,
total_spent,
avg_order_value,
days_since_last_purchase,
CASE
WHEN total_spent >= 1000 AND days_since_last_purchase <= 30 THEN 'VIP'
WHEN total_spent >= 500 AND days_since_last_purchase <= 60 THEN 'Premium'
WHEN days_since_last_purchase <= 90 THEN 'Active'
WHEN days_since_last_purchase <= 180 THEN 'At Risk'
ELSE 'Inactive'
END AS segment
FROM customer_metrics
)
SELECT
segment,
COUNT(*) AS customer_count,
AVG(total_spent) AS avg_total_spent,
AVG(purchase_frequency) AS avg_frequency
FROM customer_segments
GROUP BY segment
ORDER BY avg_total_spent DESC;
-- Sales forecasting (linear regression approximation)
WITH daily_sales AS (
SELECT
transaction_date,
SUM(amount) AS daily_revenue,
ROW_NUMBER() OVER (ORDER BY transaction_date) AS day_number
FROM sales_transactions
GROUP BY transaction_date
),
regression_stats AS (
SELECT
COUNT(*) AS n,
SUM(day_number) AS sum_x,
SUM(daily_revenue) AS sum_y,
SUM(day_number * daily_revenue) AS sum_xy,
SUM(day_number * day_number) AS sum_x2
FROM daily_sales
)
SELECT
(n * sum_xy - sum_x * sum_y) / (n * sum_x2 - sum_x * sum_x) AS slope,
(sum_y - slope * sum_x) / n AS intercept
FROM regression_stats;
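-- Snowflake's built-in regression aggregates give the same slope/intercept
-- without the manual sums (reusing the daily_sales CTE from above):
WITH daily_sales AS (
SELECT
transaction_date,
SUM(amount) AS daily_revenue,
ROW_NUMBER() OVER (ORDER BY transaction_date) AS day_number
FROM sales_transactions
GROUP BY transaction_date
)
SELECT
REGR_SLOPE(daily_revenue, day_number) AS slope,
REGR_INTERCEPT(daily_revenue, day_number) AS intercept
FROM daily_sales;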
-- Data quality checks
SELECT
'sales_transactions' AS table_name,
'Null customer_id' AS check_type,
COUNT(*) AS issue_count
FROM sales_transactions
WHERE customer_id IS NULL
UNION ALL
SELECT
'sales_transactions',
'Negative amount',
COUNT(*)
FROM sales_transactions
WHERE amount < 0
UNION ALL
SELECT
'sales_transactions',
'Future dates',
COUNT(*)
FROM sales_transactions
WHERE transaction_date > CURRENT_DATE();
Security and Governance
-- Create roles
CREATE ROLE sales_analyst;
CREATE ROLE sales_manager;
-- Grant permissions
GRANT USAGE ON DATABASE analytics_db TO ROLE sales_analyst;
GRANT USAGE ON SCHEMA sales_data TO ROLE sales_analyst;
GRANT SELECT ON TABLE sales_transactions TO ROLE sales_analyst;
-- Masking policy
CREATE MASKING POLICY customer_id_mask AS (val STRING)
RETURNS STRING ->
CASE
WHEN CURRENT_ROLE() IN ('SALES_MANAGER', 'ADMIN') THEN val
ELSE 'MASKED'
END;
-- Apply masking
ALTER TABLE sales_transactions
MODIFY COLUMN customer_id
SET MASKING POLICY customer_id_mask;
-- Row-level security
CREATE ROW ACCESS POLICY region_policy AS (region STRING)
RETURNS BOOLEAN ->
CASE
WHEN CURRENT_ROLE() = 'ADMIN' THEN TRUE
WHEN CURRENT_ROLE() = 'NORTH_SALES' AND region = 'North' THEN TRUE
WHEN CURRENT_ROLE() = 'SOUTH_SALES' AND region = 'South' THEN TRUE
ELSE FALSE
END;
-- Apply row policy
ALTER TABLE sales_transactions
ADD ROW ACCESS POLICY region_policy ON (region);
-- Network policy
CREATE NETWORK POLICY office_network
ALLOWED_IP_LIST = ('192.168.1.0/24', '10.0.0.0/8')
BLOCKED_IP_LIST = ('192.168.1.99');
-- Apply network policy to user
ALTER USER sales_user SET NETWORK_POLICY = office_network;
Monitoring and Management
-- Warehouse usage monitoring
SELECT
warehouse_name,
DATE_TRUNC('day', start_time) AS day,
SUM(credits_used) AS daily_credits,
AVG(credits_used) AS avg_credits_per_hour
FROM SNOWFLAKE.ACCOUNT_USAGE.WAREHOUSE_METERING_HISTORY
WHERE start_time >= DATEADD(day, -30, CURRENT_TIMESTAMP())
GROUP BY warehouse_name, day
ORDER BY day DESC, daily_credits DESC;
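-- Because usage-based billing complicates budgeting (see Cons), resource monitors
-- can cap credit consumption and suspend warehouses at a quota (requires ACCOUNTADMIN):
CREATE RESOURCE MONITOR monthly_limit
WITH CREDIT_QUOTA = 100
FREQUENCY = MONTHLY
START_TIMESTAMP = IMMEDIATELY
TRIGGERS
ON 80 PERCENT DO NOTIFY
ON 100 PERCENT DO SUSPEND;
ALTER WAREHOUSE compute_wh SET RESOURCE_MONITOR = monthly_limit;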
-- Query history analysis
SELECT
user_name,
warehouse_name,
database_name,
query_type,
execution_status,
total_elapsed_time / 1000 AS execution_seconds,
bytes_scanned,
rows_produced,
credits_used_cloud_services,
query_text
FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY
WHERE start_time >= DATEADD(hour, -24, CURRENT_TIMESTAMP())
AND total_elapsed_time > 10000 -- Queries longer than 10 seconds
ORDER BY total_elapsed_time DESC
LIMIT 20;
-- Storage usage
SELECT
table_catalog AS database_name,
table_schema AS schema_name,
table_name,
active_bytes / (1024*1024*1024) AS size_gb
FROM SNOWFLAKE.ACCOUNT_USAGE.TABLE_STORAGE_METRICS
WHERE deleted = FALSE
ORDER BY active_bytes DESC
LIMIT 20;
-- Data transfer monitoring (cross-region / cross-cloud egress)
SELECT
source_cloud,
source_region,
target_cloud,
target_region,
transfer_type,
SUM(bytes_transferred) / (1024*1024*1024) AS gb_transferred
FROM SNOWFLAKE.ACCOUNT_USAGE.DATA_TRANSFER_HISTORY
WHERE start_time >= DATEADD(day, -7, CURRENT_TIMESTAMP())
GROUP BY 1, 2, 3, 4, 5
ORDER BY gb_transferred DESC;
-- Performance analysis
SELECT
query_id,
execution_status,
warehouse_name,
total_elapsed_time,
compilation_time,
execution_time,
bytes_scanned,
percentage_scanned_from_cache,
query_load_percent
FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY
WHERE start_time >= DATEADD(hour, -1, CURRENT_TIMESTAMP())
AND warehouse_name = 'COMPUTE_WH'
ORDER BY total_elapsed_time DESC;