Database
Amazon Neptune
Overview
Amazon Neptune is a fully managed graph database service provided by AWS. It supports both PropertyGraph (Apache TinkerPop Gremlin) and RDF (SPARQL 1.1) graph models, delivering high availability, automatic scaling, and reduced operational overhead. Neptune seamlessly integrates with other AWS services and can process large-scale graph data with billions of relationships at millisecond latency.
Details
Amazon Neptune was officially released by AWS in 2018. Unlike traditional graph databases, it's delivered as a fully managed service, eliminating the need for infrastructure management and maintenance. It provides auto-scaling storage up to 128 TiB (64 TB at launch), six data replicas across three Availability Zones, and automatic backup with point-in-time recovery.
Key Features of Amazon Neptune:
- Dual graph engine (Property Graph + RDF)
- Apache TinkerPop Gremlin query language support
- SPARQL 1.1 compliant RDF queries
- Query processing capacity up to 100,000 QPS
- VPC execution with KMS encryption
- Fine-grained access control via AWS IAM
- Bulk loading of graph data from Amazon S3
- Automatic failover and read replicas
- Machine learning integration with Amazon SageMaker
- Connection support from AWS Lambda, EC2, EKS
Pros and Cons
Pros
- Fully Managed: Automated infrastructure management, patching, and backups
- High Availability: 99.99% availability SLA with 3-AZ configuration
- Auto Scaling: Independent scaling of storage and compute
- Dual Engine: Native support for both Gremlin and SPARQL
- AWS Ecosystem: Seamless integration with other AWS services
- Security: Integrated security with VPC, IAM, and KMS encryption
- Performance: High-speed processing with dedicated graph engine
- Reduced Operations: Integrated monitoring, alerting, and metrics
Cons
- Vendor Lock-in: AWS platform dependency
- Cost: Fixed instance fees even for small-scale usage
- Learning Curve: Need to master both Gremlin and SPARQL
- Customization Limits: Configuration constraints due to managed service
- Region Restrictions: Limited availability in certain regions
- Complex Pricing: Combination of instance, storage, and I/O charges
Main Links
Code Examples
Setup and Connection Configuration
# AWS CLI configuration
aws configure
# Set access key, secret key, and region

# Install Python Gremlin client
pip install gremlinpython boto3

# Install Python SPARQL client
pip install rdflib requests

# Java dependencies (Maven)
<dependency>
<groupId>org.apache.tinkerpop</groupId>
<artifactId>gremlin-driver</artifactId>
<version>3.6.2</version>
</dependency>
<dependency>
<groupId>org.eclipse.rdf4j</groupId>
<artifactId>rdf4j-repository-sparql</artifactId>
<version>3.7.4</version>
</dependency>

# Network configuration check
# Open port 8182 in the Neptune VPC security group. Neptune serves Gremlin
# (/gremlin), SPARQL (/sparql), and openCypher (/openCypher) on this single
# port — there is no separate SPARQL port.
Basic Operations (Gremlin)
# Gremlin connection and CRUD operations.
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.process.anonymous_traversal import traversal

# Neptune Gremlin endpoint: WebSocket on port 8182, path /gremlin.
database_url = "wss://your-neptune-endpoint.region.neptune.amazonaws.com:8182/gremlin"
remoteConn = DriverRemoteConnection(database_url, "g")
g = traversal().withRemote(remoteConn)

try:
    # Create vertices
    person = g.addV("Person").property("name", "John Smith").property("age", 30).next()
    company = g.addV("Company").property("name", "Tech Corp").property("industry", "IT").next()

    # Create an edge between the two vertices
    g.V(person).addE("WORKS_FOR").to(g.V(company)).property("since", 2020).iterate()

    # Read vertices
    persons = g.V().hasLabel("Person").toList()
    print(f"Persons found: {len(persons)}")

    # Search with specific conditions
    result = g.V().hasLabel("Person").has("name", "John Smith").valueMap().toList()
    print(result)

    # Relationship (path) queries
    paths = g.V().hasLabel("Person").has("name", "John Smith")\
        .outE("WORKS_FOR").inV().hasLabel("Company")\
        .path().by("name").toList()

    # Update vertex properties
    g.V().hasLabel("Person").has("name", "John Smith")\
        .property("age", 31).property("location", "New York").iterate()

    # Delete vertices
    g.V().hasLabel("Person").has("name", "John Smith").drop().iterate()
finally:
    # Release the WebSocket connection even if a query raises.
    remoteConn.close()
Basic Operations (SPARQL)
# SPARQL connection and CRUD operations.
import requests

# Neptune serves SPARQL over HTTPS on port 8182 at /sparql — the same
# port as Gremlin (there is no separate port 8183 for SPARQL).
sparql_endpoint = "https://your-neptune-endpoint.region.neptune.amazonaws.com:8182/sparql"

# Insert RDF triples (Create) — sent as a SPARQL UPDATE ('update' form field).
insert_query = """
INSERT DATA {
<http://example.org/person/john> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/Person> .
<http://example.org/person/john> <http://example.org/name> "John Smith" .
<http://example.org/person/john> <http://example.org/age> "30"^^<http://www.w3.org/2001/XMLSchema#int> .
<http://example.org/company/tech> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/Company> .
<http://example.org/company/tech> <http://example.org/name> "Tech Corp" .
<http://example.org/person/john> <http://example.org/worksFor> <http://example.org/company/tech> .
}
"""
response = requests.post(
    sparql_endpoint,
    data={'update': insert_query},
    headers={'Content-Type': 'application/x-www-form-urlencoded'})
response.raise_for_status()  # surface HTTP-level failures instead of ignoring them

# Query data (Read) — sent as a SPARQL query ('query' form field).
select_query = """
SELECT ?person ?name ?age WHERE {
?person <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/Person> .
?person <http://example.org/name> ?name .
?person <http://example.org/age> ?age .
}
"""
response = requests.post(
    sparql_endpoint,
    data={'query': select_query},
    headers={'Content-Type': 'application/x-www-form-urlencoded'})
response.raise_for_status()
results = response.json()

# Relationship queries
relationship_query = """
SELECT ?person ?company WHERE {
?person <http://example.org/worksFor> ?company .
?person <http://example.org/name> "John Smith" .
}
"""

# Update data: delete the old age triple and insert the new one atomically.
update_query = """
DELETE { <http://example.org/person/john> <http://example.org/age> ?oldAge }
INSERT { <http://example.org/person/john> <http://example.org/age> "31"^^<http://www.w3.org/2001/XMLSchema#int> }
WHERE { <http://example.org/person/john> <http://example.org/age> ?oldAge }
"""

# Delete data: remove every triple whose subject is the person.
delete_query = """
DELETE WHERE {
<http://example.org/person/john> ?p ?o .
}
"""
Data Modeling and Schema Design
# Complex data modeling with Gremlin: e-commerce system modeling
# (users, products, and categories).
def create_ecommerce_model():
    """Populate the graph with a small e-commerce sample dataset.

    Relies on a module-level Gremlin traversal source ``g`` that is
    already connected to Neptune (see the connection example earlier).
    """
    # Create user
    user1 = g.addV("User").property("id", "user001").property("name", "Alice Johnson")\
        .property("email", "[email protected]").property("joinDate", "2024-01-15").next()

    # Create products and a category
    laptop = g.addV("Product").property("id", "prod001").property("name", "Laptop")\
        .property("price", 1200).property("brand", "TechBrand").next()
    mouse = g.addV("Product").property("id", "prod002").property("name", "Wireless Mouse")\
        .property("price", 30).property("brand", "TechBrand").next()
    category = g.addV("Category").property("name", "Computers").property("level", 1).next()

    # Create relationships (purchase, view, category membership)
    g.V(user1).addE("PURCHASED").to(g.V(laptop))\
        .property("date", "2024-02-01").property("quantity", 1).iterate()
    g.V(user1).addE("VIEWED").to(g.V(mouse))\
        .property("timestamp", "2024-02-15T10:30:00").iterate()
    g.V(laptop).addE("BELONGS_TO").to(g.V(category)).iterate()
    g.V(mouse).addE("BELONGS_TO").to(g.V(category)).iterate()
# Ontology design with SPARQL.
# Declares lightweight RDFS classes and properties, then asserts a few
# sample instances. Submit to Neptune as a SPARQL UPDATE request
# (the 'update' form field, as in the insert example earlier).
ontology_insert = """
PREFIX ex: <http://example.org/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
INSERT DATA {
ex:User rdf:type rdfs:Class .
ex:Product rdf:type rdfs:Class .
ex:Category rdf:type rdfs:Class .
ex:hasName rdf:type rdf:Property .
ex:hasPrice rdf:type rdf:Property .
ex:belongsToCategory rdf:type rdf:Property .
ex:purchasedBy rdf:type rdf:Property .
ex:user001 rdf:type ex:User .
ex:user001 ex:hasName "Alice Johnson" .
ex:prod001 rdf:type ex:Product .
ex:prod001 ex:hasName "Laptop" .
ex:prod001 ex:hasPrice "1200"^^<http://www.w3.org/2001/XMLSchema#int> .
}
"""
Performance Optimization and Indexing
# Gremlin index optimization strategies.
# Neptune manages indexes automatically, so tuning focuses on writing
# selective query patterns rather than creating indexes.
def optimized_queries():
    """Examples of index-friendly Gremlin query patterns.

    Requires a connected traversal source ``g`` and
    ``from gremlin_python.process.traversal import P`` in scope.
    """
    # Filter as early as possible with hasLabel()/has() so Neptune can
    # resolve the starting vertices from its indexes.
    result = g.V().hasLabel("User").has("email", "[email protected]")\
        .outE("PURCHASED").inV().hasLabel("Product").valueMap().toList()

    # Multiple property filtering (range predicate via P.gte)
    expensive_products = g.V().hasLabel("Product")\
        .has("price", P.gte(1000))\
        .has("category", "electronics").toList()

    # Explicitly specify edge direction to avoid scanning both directions
    user_purchases = g.V().hasLabel("User").has("id", "user001")\
        .out("PURCHASED").hasLabel("Product").toList()
# Query profiling
def profile_query():
    """Print Neptune's execution profile for a sample traversal.

    The terminal profile() step returns per-step timing and metrics
    instead of the query's normal result.
    """
    profile_result = g.V().hasLabel("User")\
        .out("PURCHASED").hasLabel("Product")\
        .profile().toList()
    print(profile_result)
# SPARQL query optimization: bind the most selective triple patterns first
# and push filtering, ordering, and limiting into the query itself so the
# engine (not the client) does the work.
optimized_sparql = """
PREFIX ex: <http://example.org/>
SELECT ?user ?product ?price WHERE {
?user a ex:User .
?user ex:purchased ?product .
?product ex:hasPrice ?price .
FILTER(?price > 500)
}
ORDER BY DESC(?price)
LIMIT 10
"""
# Batch operation optimization
def batch_operations():
    """Create two vertices and an edge in a single Gremlin traversal.

    Chaining the writes into one traversal means one round trip to
    Neptune instead of three.
    """
    batch_traversal = g.addV("User").property("name", "User1")\
        .as_("user1")\
        .addV("Product").property("name", "Product1")\
        .as_("product1")\
        .addE("PURCHASED").from_("user1").to("product1")
    # Nothing executes until a terminal step (iterate) runs the traversal.
    batch_traversal.iterate()
Advanced Analytics Queries
# Advanced graph analytics with Gremlin.
def advanced_graph_analytics():
    """Path finding, recommendation, centrality, and triangle queries.

    Requires a connected traversal source ``g`` plus these gremlinpython
    names in scope: ``P``, ``Order``, ``Column`` from
    ``gremlin_python.process.traversal``, and the anonymous-traversal
    steps (``outE``, ``hasLabel``, ``bothE``) used inside repeat()/by().
    """
    # Shortest path: breadth-first expansion via repeat/until;
    # simplePath() prevents revisiting vertices (no cycles).
    shortest_path = g.V().hasLabel("User").has("name", "Alice Johnson")\
        .repeat(outE().otherV().simplePath())\
        .until(hasLabel("Product").has("name", "Laptop"))\
        .path().limit(1).toList()

    # Recommendation (collaborative filtering): products bought by users
    # who share purchases with Alice, excluding what she already bought.
    recommendations = g.V().hasLabel("User").has("name", "Alice Johnson")\
        .out("PURCHASED").aggregate("purchased")\
        .in_("PURCHASED").where(P.neq("Alice Johnson"))\
        .out("PURCHASED").where(P.without("purchased"))\
        .groupCount().order().by(Column.values, Order.desc)\
        .limit(5).toList()

    # Degree-based centrality: rank products by total edge count.
    centrality = g.V().hasLabel("Product")\
        .project("product", "connections")\
        .by("name")\
        .by(bothE().count())\
        .order().by(Column.values, Order.desc)\
        .limit(10).toList()

    # Community detection (triangle search over FRIENDS_WITH edges):
    # a -> b -> c where c is the starting vertex a.
    triangles = g.V().hasLabel("User").as_("a")\
        .out("FRIENDS_WITH").as_("b")\
        .out("FRIENDS_WITH").as_("c")\
        .where("c", P.eq("a"))\
        .select("a", "b", "c").by("name").toList()
# Advanced SPARQL analytics.
# NOTE(review): this string holds three independent queries separated only
# by comments — submit them to Neptune one at a time, not as one request.
advanced_sparql_queries = """
# PageRank-style importance calculation
PREFIX ex: <http://example.org/>
SELECT ?product (COUNT(?purchase) AS ?popularity) WHERE {
?user ex:purchased ?product .
?product ex:hasName ?productName .
}
GROUP BY ?product
ORDER BY DESC(?popularity)
LIMIT 10
# Time series analysis
SELECT ?month (COUNT(?purchase) AS ?sales) WHERE {
?user ex:purchased ?product .
?purchase ex:date ?date .
BIND(SUBSTR(?date, 1, 7) AS ?month)
}
GROUP BY ?month
ORDER BY ?month
# Graph pattern matching
SELECT ?user1 ?user2 ?commonProduct WHERE {
?user1 ex:purchased ?commonProduct .
?user2 ex:purchased ?commonProduct .
FILTER(?user1 != ?user2)
}
"""
AWS Integration and Operations Monitoring
# AWS Neptune monitoring and management.
import boto3

# CloudWatch metrics retrieval
def get_neptune_metrics():
    """Fetch Gremlin request-rate statistics for a Neptune cluster.

    Returns the raw CloudWatch datapoints (average and maximum per
    5-minute period) for the GremlinRequestsPerSec metric over one day.
    """
    cloudwatch = boto3.client('cloudwatch')
    response = cloudwatch.get_metric_statistics(
        Namespace='AWS/Neptune',
        MetricName='GremlinRequestsPerSec',
        Dimensions=[
            {
                'Name': 'DBClusterIdentifier',
                'Value': 'your-neptune-cluster'
            }
        ],
        StartTime='2024-01-01T00:00:00Z',
        EndTime='2024-01-02T00:00:00Z',
        Period=300,  # seconds per datapoint
        Statistics=['Average', 'Maximum']
    )
    return response['Datapoints']
# IAM role-based access control
def create_neptune_connection_with_iam():
    """Sketch of connecting to Neptune with IAM (SigV4) authentication.

    When IAM auth is enabled on the cluster, every request — including the
    Gremlin WebSocket handshake — must carry an AWS SigV4 signature.
    Requires ``boto3`` to be imported at module level.
    """
    from botocore.auth import SigV4Auth
    from botocore.awsrequest import AWSRequest

    # Resolve credentials from the default provider chain (env vars,
    # instance profile, etc.).
    session = boto3.Session()
    credentials = session.get_credentials()

    # Signed Gremlin connection target (WebSocket, port 8182)
    database_url = "wss://your-neptune-endpoint.region.neptune.amazonaws.com:8182/gremlin"

    # Custom authentication handler implementation
    def create_signed_connection():
        # TODO: sign the WebSocket upgrade request with SigV4Auth using
        # `credentials` before opening the Gremlin connection.
        pass
# Neptune operations in a Lambda function.
def lambda_handler(event, context):
    """AWS Lambda entry point: list products purchased by event['userId'].

    Expects ``DriverRemoteConnection``, ``traversal``, and ``json`` to be
    imported at module level (see the Gremlin example earlier). Returns an
    API-Gateway-style dict with statusCode and a JSON body.
    """
    try:
        database_url = "wss://your-neptune-endpoint.region.neptune.amazonaws.com:8182/gremlin"
        remoteConn = DriverRemoteConnection(database_url, "g")
        g = traversal().withRemote(remoteConn)
        try:
            # Business logic: products the given user has purchased
            result = g.V().hasLabel("User").has("id", event['userId'])\
                .out("PURCHASED").hasLabel("Product").valueMap().toList()
        finally:
            # Release the connection even when the query raises;
            # the original leaked it on failure.
            remoteConn.close()
        return {
            'statusCode': 200,
            'body': json.dumps(result)
        }
    except Exception as e:
        # Lambda handlers return an HTTP-style error rather than raising.
        return {
            'statusCode': 500,
            'body': json.dumps({'error': str(e)})
        }
# Backup and recovery management.
def neptune_backup_management():
    """Create a manual cluster snapshot and restore to a point in time.

    Requires ``boto3`` to be imported at module level.
    """
    neptune = boto3.client('neptune')

    # Create a manual snapshot. The cluster-level API takes
    # DBClusterSnapshotIdentifier (DBSnapshotIdentifier belongs to the
    # instance-level create_db_snapshot call and would raise here).
    response = neptune.create_db_cluster_snapshot(
        DBClusterSnapshotIdentifier='manual-snapshot-2024-01',
        DBClusterIdentifier='your-neptune-cluster'
    )

    # Point-in-time recovery into a new cluster
    response = neptune.restore_db_cluster_to_point_in_time(
        DBClusterIdentifier='restored-cluster',
        SourceDBClusterIdentifier='your-neptune-cluster',
        RestoreToTime='2024-01-01T12:00:00Z'
    )
Practical Use Cases Implementation
# Fraud detection system implementation.
def fraud_detection_system():
    """Flag accounts receiving many large transfers plus cyclic transfers.

    Requires a connected traversal source ``g`` and gremlinpython's ``P``,
    ``Column``, and anonymous-traversal ``where`` in scope.
    """
    # Accounts that received more than 10 large (>100k), recent transfers.
    # NOTE(review): where(Column.values, P.gt(10)) filters the grouped
    # counts after unfold(); confirm this form against the gremlinpython
    # version in use.
    suspicious_transactions = g.V().hasLabel("Account")\
        .outE("TRANSFER")\
        .has("amount", P.gt(100000))\
        .has("timestamp", P.gt("2024-01-01"))\
        .inV().hasLabel("Account")\
        .groupCount().by("accountId")\
        .unfold()\
        .where(Column.values, P.gt(10))\
        .toList()

    # Ring money-transfer detection: transfer chains that loop back to
    # the starting account (simplePath keeps intermediate hops distinct).
    money_laundering = g.V().hasLabel("Account").as_("start")\
        .repeat(outE("TRANSFER").inV().simplePath())\
        .until(where(P.eq("start")))\
        .path().toList()

    return suspicious_transactions, money_laundering
# Social network analysis
def social_network_analysis():
    """Rank influential users and sketch community detection.

    Requires a connected traversal source ``g`` plus gremlinpython's
    ``P``, ``Order``, ``Column``, ``__``, and anonymous-traversal steps
    (``inE``, ``outE``) in scope.
    """
    # Influential users: follower count and an engagement proxy.
    # NOTE(review): Column.values.get(2) is not a standard gremlinpython
    # ordering key — verify this against the client version before use.
    influencers = g.V().hasLabel("User")\
        .project("user", "followers", "influence")\
        .by("name")\
        .by(inE("FOLLOWS").count())\
        .by(inE("FOLLOWS").outV().outE("LIKES").count().mean())\
        .order().by(Column.values.get(2), Order.desc)\
        .limit(10).toList()

    # Community detection (modularity-maximization sketch): expand
    # friendship paths until enough known users are revisited.
    communities = g.V().hasLabel("User")\
        .aggregate("allUsers")\
        .repeat(outE("FRIENDS_WITH").otherV().simplePath())\
        .until(__.where(P.within("allUsers")).count().is_(P.gt(5)))\
        .path().toList()
# Knowledge graph system.
# NOTE(review): this string holds three independent queries separated only
# by comments — submit them to Neptune one at a time, not as one request.
knowledge_graph_sparql = """
PREFIX ex: <http://example.org/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
# Entity relationship inference
SELECT ?entity1 ?relationship ?entity2 WHERE {
?entity1 ?relationship ?entity2 .
?entity1 rdf:type ex:Person .
?entity2 rdf:type ex:Organization .
FILTER(?relationship != rdf:type)
}
# Hierarchical structure search
SELECT ?parent ?child ?depth WHERE {
ex:root_entity (ex:hasChild+) ?child .
ex:root_entity (ex:hasChild*) ?parent .
?parent ex:hasChild ?child .
}
# Lexical similarity search
SELECT ?concept ?similarity WHERE {
?concept rdfs:label ?label .
FILTER(CONTAINS(LCASE(?label), "technology"))
BIND(1.0 AS ?similarity)
}
ORDER BY DESC(?similarity)
"""
# Recommendation engine
def recommendation_engine():
    """Collaborative and content-based product recommendations.

    Requires a connected traversal source ``g`` plus gremlinpython's
    ``P``, ``Order``, and ``Column`` in scope.
    """
    # Collaborative filtering: products bought by users with overlapping
    # purchase histories, excluding the target user's own purchases.
    collaborative_filtering = g.V().hasLabel("User").has("id", "target_user")\
        .out("PURCHASED").aggregate("purchased")\
        .in_("PURCHASED").where(P.neq("target_user"))\
        .as_("similar_user")\
        .out("PURCHASED").where(P.without("purchased"))\
        .groupCount().by("productId")\
        .order().by(Column.values, Order.desc)\
        .limit(10).toList()

    # Content-based filtering: other products from the same categories
    # as the target user's past purchases.
    content_based = g.V().hasLabel("User").has("id", "target_user")\
        .out("PURCHASED").hasLabel("Product")\
        .out("BELONGS_TO").hasLabel("Category")\
        .in_("BELONGS_TO").hasLabel("Product")\
        .where(P.without("purchased"))\
        .groupCount().by("productId")\
        .order().by(Column.values, Order.desc)\
        .limit(5).toList()

    return collaborative_filtering, content_based