ClickHouse
Overview
ClickHouse is a real-time analytics database management system. Utilizing a columnar storage architecture, it enables high-speed query execution on large datasets. Designed specifically for OLAP (Online Analytical Processing), it can return query results in seconds even on datasets with billions of rows.
Details
ClickHouse was originally developed at Yandex and released as open source in 2016; it is now developed and maintained by ClickHouse, Inc. Through columnar storage, a vectorized execution engine, and aggressive data compression, it delivers analytics performance that is difficult to achieve with traditional OLTP databases. It supports a distributed architecture and handles large-scale data processing through horizontal scaling.
Key features of ClickHouse:
- Columnar storage engine
- Vectorized query execution
- Advanced data compression
- Distributed processing and sharding
- SQL-like query language
- High-speed data ingestion
- Real-time analytics
- Support for diverse data types
- Rich aggregation functions
- HTTP/TCP interfaces
Advantages and Disadvantages
Advantages
- Ultra-fast: High-speed query execution through columnar storage
- High compression: Reduced storage costs through efficient data compression
- Scalability: Horizontal scaling for large volumes of data
- Real-time: Immediate analysis of inserted data
- SQL compatibility: Low learning curve thanks to largely standard SQL syntax
- Rich functionality: Statistical functions, time-series analysis, approximate calculations
- Open source: Commercial use available under Apache 2.0 license
Disadvantages
- Update limitations: Not suitable for frequent updates and deletions
- OLTP unsuitability: Not intended for transaction processing
- Complexity: Complex configuration and operation for distributed setups
- Memory consumption: Large GROUP BY and JOIN queries can consume significant amounts of memory
- Join limitations: Heavy joins between large tables can be expensive
Code Examples
Installation & Setup
# Run with Docker (recommended)
docker run -d --name clickhouse-server \
--ulimit nofile=262144:262144 \
-p 8123:8123 -p 9000:9000 \
-v clickhouse-data:/var/lib/clickhouse \
clickhouse/clickhouse-server
# Ubuntu/Debian
sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 8919F6BD2B48D754
echo "deb https://packages.clickhouse.com/deb stable main" | sudo tee /etc/apt/sources.list.d/clickhouse.list
sudo apt update && sudo apt install -y clickhouse-server clickhouse-client
# Red Hat/CentOS
sudo yum install -y yum-utils
sudo yum-config-manager --add-repo https://packages.clickhouse.com/rpm/clickhouse.repo
sudo yum install -y clickhouse-server clickhouse-client
# macOS (Homebrew)
brew install clickhouse
# Start service
sudo systemctl start clickhouse-server
sudo systemctl enable clickhouse-server
# Connect client
clickhouse-client
# Web interface
# http://localhost:8123/play
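-- Quick sanity check once connected with clickhouse-client
-- (a minimal sketch, assuming a default local install)
SELECT version();
SELECT uptime();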
Basic Operations (DDL/DML)
-- Create database
CREATE DATABASE analytics;
USE analytics;
-- Create table (MergeTree engine)
CREATE TABLE events
(
event_time DateTime,
user_id UInt32,
session_id String,
page_url String,
referrer String,
user_agent String,
country_code FixedString(2),
device_type Enum('desktop' = 1, 'mobile' = 2, 'tablet' = 3),
duration UInt32,
bytes_downloaded UInt64
)
ENGINE = MergeTree()
ORDER BY (event_time, user_id)
PARTITION BY toYYYYMM(event_time);
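-- ORDER BY defines the sparse primary index (and the on-disk sort order within each part),
-- while PARTITION BY controls which parts can be pruned by a WHERE clause.
-- One way to confirm a filter can use them (a sketch; EXPLAIN works on an empty table too):
EXPLAIN indexes = 1
SELECT count() FROM events WHERE event_time >= '2024-01-01';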
-- Insert data
INSERT INTO events VALUES
('2024-01-15 10:30:00', 1001, 'sess001', '/home', 'https://google.com', 'Mozilla/5.0...', 'JP', 'desktop', 120, 204800),
('2024-01-15 10:31:00', 1002, 'sess002', '/products', 'https://google.com', 'Mozilla/5.0...', 'US', 'mobile', 45, 51200),
('2024-01-15 10:32:00', 1001, 'sess001', '/about', '/home', 'Mozilla/5.0...', 'JP', 'desktop', 90, 153600);
-- Bulk insert (from CSV file)
INSERT INTO events
SELECT *
FROM file('/path/to/events.csv', 'CSV',
'event_time DateTime, user_id UInt32, session_id String, page_url String, referrer String, user_agent String, country_code FixedString(2), device_type String, duration UInt32, bytes_downloaded UInt64');
-- Update data (ALTER UPDATE)
ALTER TABLE events UPDATE device_type = 'mobile' WHERE user_agent LIKE '%Mobile%';
-- Delete data (ALTER DELETE)
ALTER TABLE events DELETE WHERE event_time < '2024-01-01';
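-- ALTER UPDATE / ALTER DELETE run as asynchronous "mutations" that rewrite data parts
-- in the background; their progress can be monitored via system.mutations:
SELECT table, command, is_done, latest_fail_reason
FROM system.mutations
WHERE NOT is_done;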
Queries & Analytics
-- Basic aggregation
SELECT
toDate(event_time) AS date,
country_code,
device_type,
count() AS page_views,
uniq(user_id) AS unique_users,
avg(duration) AS avg_duration,
sum(bytes_downloaded) AS total_bytes
FROM events
WHERE event_time >= '2024-01-01'
GROUP BY date, country_code, device_type
ORDER BY date DESC, page_views DESC;
-- Time series analysis
SELECT
toStartOfHour(event_time) AS hour,
count() AS events,
uniq(user_id) AS unique_users,
quantile(0.5)(duration) AS median_duration,
quantile(0.95)(duration) AS p95_duration
FROM events
WHERE event_time >= today() - 7
GROUP BY hour
ORDER BY hour;
-- User behavior analysis
SELECT
user_id,
count() AS page_views,
sum(duration) AS total_time,
groupArray(page_url) AS visited_pages,
min(event_time) AS first_visit,
max(event_time) AS last_visit
FROM events
WHERE toDate(event_time) = today()
GROUP BY user_id
HAVING page_views > 5
ORDER BY total_time DESC
LIMIT 100;
-- Cohort analysis
SELECT
toYYYYMM(min_event_time) AS cohort_month,
dateDiff('month', min_event_time, event_time) AS period_number,
count(DISTINCT user_id) AS users
FROM (
SELECT
user_id,
event_time,
min(event_time) OVER (PARTITION BY user_id) AS min_event_time
FROM events
)
GROUP BY cohort_month, period_number
ORDER BY cohort_month, period_number;
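-- Funnel analysis (a sketch; the page URLs below are illustrative).
-- windowFunnel counts how far each user progresses through an ordered chain of
-- conditions within a time window (here 3600 seconds = 1 hour):
SELECT
level,
count() AS users
FROM
(
SELECT
user_id,
windowFunnel(3600)(event_time,
page_url = '/home',
page_url = '/products',
page_url = '/checkout') AS level
FROM events
GROUP BY user_id
)
GROUP BY level
ORDER BY level;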
Advanced Analytics
-- Window functions
SELECT
user_id,
event_time,
page_url,
lagInFrame(page_url) OVER (
PARTITION BY user_id
ORDER BY event_time
ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
) AS previous_page,
row_number() OVER (
PARTITION BY user_id
ORDER BY event_time
) AS page_sequence
FROM events
ORDER BY user_id, event_time;
-- Array functions
SELECT
user_id,
groupArray(page_url) AS user_journey,
arrayStringConcat(groupArray(page_url), ' -> ') AS journey_path,
arrayElement(groupArray(page_url), 1) AS landing_page,
arrayElement(groupArray(page_url), -1) AS exit_page
FROM events
GROUP BY user_id
HAVING length(user_journey) > 3;
-- Approximate calculations (HyperLogLog)
SELECT
country_code,
uniqExact(user_id) AS exact_unique_users,
uniqHLL12(user_id) AS approx_unique_users
FROM events
GROUP BY country_code;
-- JSON data processing
CREATE TABLE user_events
(
user_id UInt32,
event_time DateTime,
properties String
)
ENGINE = MergeTree()
ORDER BY (event_time, user_id);
INSERT INTO user_events VALUES
(1001, '2024-01-15 10:30:00', '{"product_id": 123, "category": "electronics", "price": 299.99}');
SELECT
user_id,
JSONExtractString(properties, 'category') AS category,
JSONExtractFloat(properties, 'price') AS price
FROM user_events
WHERE JSONHas(properties, 'price');
Performance Optimization
-- Table compression settings
CREATE TABLE compressed_events
(
event_time DateTime CODEC(DoubleDelta),
user_id UInt32 CODEC(Delta, LZ4),
page_url String CODEC(LZ4),
duration UInt32 CODEC(Delta, LZ4)
)
ENGINE = MergeTree()
ORDER BY (event_time, user_id)
PARTITION BY toYYYYMM(event_time);
-- Projections (pre-aggregation)
ALTER TABLE events
ADD PROJECTION daily_stats
(
SELECT
toDate(event_time) AS date,
country_code,
count(),
uniq(user_id)
GROUP BY date, country_code
);
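-- A newly added projection only applies to parts written afterwards; to build it
-- for existing data, materialize it (this runs as a background mutation):
ALTER TABLE events MATERIALIZE PROJECTION daily_stats;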
-- Materialized views
CREATE MATERIALIZED VIEW daily_summary
ENGINE = MergeTree()
ORDER BY (date, country_code)
AS SELECT
toDate(event_time) AS date,
country_code,
device_type,
count() AS events,
uniq(user_id) AS unique_users,
avg(duration) AS avg_duration
FROM events
GROUP BY date, country_code, device_type;
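-- Note: with a plain MergeTree target, the GROUP BY above is applied per inserted block,
-- so the view can hold several partial rows per key. For incrementally correct aggregates,
-- a common pattern (a sketch; the view name is illustrative) is AggregatingMergeTree
-- with -State / -Merge combinators:
CREATE MATERIALIZED VIEW daily_summary_agg
ENGINE = AggregatingMergeTree()
ORDER BY (date, country_code)
AS SELECT
toDate(event_time) AS date,
country_code,
uniqState(user_id) AS unique_users_state,
avgState(duration) AS avg_duration_state
FROM events
GROUP BY date, country_code;
-- Finalize the aggregate states at query time
SELECT
date,
country_code,
uniqMerge(unique_users_state) AS unique_users,
avgMerge(avg_duration_state) AS avg_duration
FROM daily_summary_agg
GROUP BY date, country_code;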
-- Execution plan analysis
EXPLAIN PIPELINE
SELECT count(), avg(duration)
FROM events
WHERE event_time >= '2024-01-01'
AND country_code = 'JP';
Distributed Tables
-- Replicated tables
CREATE TABLE events_replica ON CLUSTER cluster_name
(
event_time DateTime,
user_id UInt32,
page_url String
)
ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/events', '{replica}')
ORDER BY (event_time, user_id)
PARTITION BY toYYYYMM(event_time);
-- Distributed table
CREATE TABLE events_distributed ON CLUSTER cluster_name
AS events_replica
ENGINE = Distributed(cluster_name, analytics, events_replica, rand());
-- Check cluster information
SELECT * FROM system.clusters;
-- Distributed query
SELECT
count() AS total_events,
uniq(user_id) AS unique_users
FROM events_distributed
WHERE event_time >= today() - 30;
Practical Examples
-- Real-time dashboard queries
SELECT
toStartOfMinute(event_time) AS minute,
count() AS requests_per_minute,
countIf(page_url = '/checkout') AS checkout_views,
uniq(user_id) AS active_users,
avg(duration) AS avg_page_load_time
FROM events
WHERE event_time >= now() - INTERVAL 1 HOUR
GROUP BY minute
ORDER BY minute DESC
LIMIT 60;
-- A/B testing analysis
WITH experiment_users AS (
SELECT user_id, if(user_id % 2 = 0, 'A', 'B') AS variant
FROM (SELECT DISTINCT user_id FROM events)
)
SELECT
e.variant,
count() AS page_views,
uniq(ev.user_id) AS unique_users,
countIf(ev.page_url = '/purchase') AS conversions,
conversions / unique_users AS conversion_rate
FROM events ev
JOIN experiment_users e ON ev.user_id = e.user_id
WHERE ev.event_time >= '2024-01-01'
GROUP BY e.variant;
-- Anomaly detection
-- (window functions cannot be referenced in HAVING, so the ratio filter moves to an outer query)
SELECT
hour,
events,
avg_previous_24h,
events / avg_previous_24h AS anomaly_ratio
FROM
(
SELECT
toStartOfHour(event_time) AS hour,
count() AS events,
avg(count()) OVER (
ORDER BY hour
ROWS BETWEEN 23 PRECEDING AND 1 PRECEDING
) AS avg_previous_24h
FROM events
WHERE event_time >= now() - INTERVAL 48 HOUR
GROUP BY hour
)
WHERE anomaly_ratio > 2 OR anomaly_ratio < 0.5
ORDER BY hour DESC;
External Data Integration
-- Reading data from S3
SELECT count()
FROM s3('https://bucket.s3.amazonaws.com/events/*.csv.gz',
'AccessKeyId', 'SecretAccessKey',
'CSV', 'event_time DateTime, user_id UInt32, action String');
-- MySQL engine table
CREATE TABLE mysql_users
(
id UInt32,
name String,
email String
)
ENGINE = MySQL('mysql_server:3306', 'database', 'users', 'username', 'password');
-- URL data retrieval
SELECT *
FROM url('https://api.example.com/data.json', 'JSONEachRow')
LIMIT 10;
-- Kafka integration
CREATE TABLE kafka_events
(
event_time DateTime,
user_id UInt32,
action String
)
ENGINE = Kafka
SETTINGS kafka_broker_list = 'localhost:9092',
kafka_topic_list = 'events',
kafka_group_name = 'clickhouse_consumer',
kafka_format = 'JSONEachRow';
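-- A Kafka engine table is only a streaming consumer and does not store data.
-- The usual pattern (a sketch; table names are illustrative) is to persist messages
-- into a MergeTree table through a materialized view:
CREATE TABLE kafka_events_store
(
event_time DateTime,
user_id UInt32,
action String
)
ENGINE = MergeTree()
ORDER BY (event_time, user_id);
CREATE MATERIALIZED VIEW kafka_events_consumer TO kafka_events_store
AS SELECT event_time, user_id, action
FROM kafka_events;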
System Management & Monitoring
-- Check table sizes
SELECT
table,
formatReadableSize(sum(bytes)) AS size,
sum(rows) AS rows,
count() AS parts
FROM system.parts
WHERE active
GROUP BY table
ORDER BY sum(bytes) DESC;
-- Query history analysis
SELECT
type,
event_time,
query_duration_ms,
formatReadableSize(memory_usage) AS memory,
read_rows,
substring(query, 1, 100) AS query_preview
FROM system.query_log
WHERE event_time >= now() - INTERVAL 1 HOUR
AND type = 'QueryFinish'
ORDER BY query_duration_ms DESC
LIMIT 10;
-- Profiling
SELECT
ProfileEvents.Names,
ProfileEvents.Values
FROM system.query_log
WHERE query_id = 'YOUR_QUERY_ID'
AND type = 'QueryFinish'
FORMAT Vertical;
-- Background process monitoring
SELECT * FROM system.metrics
WHERE metric LIKE 'Background%';
-- Replication status
SELECT * FROM system.replicas;
Configuration & Tuning
<!-- Key settings in config.xml -->
<clickhouse>
<max_connections>4096</max_connections>
<keep_alive_timeout>3</keep_alive_timeout>
<max_concurrent_queries>100</max_concurrent_queries>
<merge_tree>
<max_suspicious_broken_parts>5</max_suspicious_broken_parts>
<parts_to_delay_insert>150</parts_to_delay_insert>
<parts_to_throw_insert>300</parts_to_throw_insert>
</merge_tree>
<distributed_ddl>
<path>/clickhouse/task_queue/ddl</path>
</distributed_ddl>
<compression>
<case>
<min_part_size>10000000000</min_part_size>
<min_part_size_ratio>0.01</min_part_size_ratio>
<method>lz4hc</method>
</case>
</compression>
</clickhouse>
-- Session settings
SET max_memory_usage = 10000000000;
SET max_execution_time = 300;
SET optimize_use_projections = 1;
SET allow_experimental_analyzer = 1;
-- Check global settings
SELECT name, value FROM system.settings
WHERE name LIKE '%memory%'
ORDER BY name;
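-- Settings that differ from their defaults in the current session
SELECT name, value
FROM system.settings
WHERE changed
ORDER BY name;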