Skip to main content

Online Endpoints for Real-Time Inference

Online endpoints provide real-time inference for machine learning models with low latency, automatic scaling, and built-in monitoring.
Managed online endpoints handle infrastructure, scaling, and security automatically - you focus on your model.

What are Online Endpoints?

Online endpoints deploy models to web servers that return predictions via HTTP. They’re optimized for:

Low Latency

Sub-second response times

Synchronous Requests

Request-response pattern

Real-Time Scoring

Immediate predictions

Auto Scaling

Handle traffic spikes

When to Use Online Endpoints

Choose online endpoints when:
1. **Low Latency Required** — need predictions in <1 second
2. **Request-Response Pattern** — client waits for an immediate response
3. **Small Payloads** — input data fits in an HTTP request
4. **Variable Traffic** — need to scale based on request volume

Managed Online Endpoints

Recommended deployment method with full infrastructure management.

Key Features

Fully Managed:
  • Automatic compute provisioning
  • OS updates and patching
  • Node recovery on failure
  • Load balancing
  • SSL termination

Create Online Endpoint

Step 1: Define Endpoint

from azure.ai.ml import MLClient
from azure.ai.ml.entities import ManagedOnlineEndpoint
from azure.identity import DefaultAzureCredential

# Authenticate and connect to the target workspace.
ml_client = MLClient(
    DefaultAzureCredential(),
    subscription_id="<subscription-id>",
    resource_group="<resource-group>",
    workspace_name="<workspace>"
)

# Declare the endpoint: a stable name, auth mode, and tags.
# Deployments are attached to it in a later step.
endpoint = ManagedOnlineEndpoint(
    name="fraud-detection-api",
    description="Real-time fraud detection endpoint",
    auth_mode="key",  # or "aml_token" for Entra ID
    tags={
        "environment": "production",
        "team": "ml-ops",
        "cost-center": "engineering"
    }
)

# Rebind to the object returned by the service: the locally constructed
# endpoint has no scoring_uri until the create call completes.
endpoint = ml_client.online_endpoints.begin_create_or_update(endpoint).result()
print(f"Endpoint URI: {endpoint.scoring_uri}")

Step 2: Create Deployment

# A deployment is the serving stack (model + compute + probes) behind the
# endpoint; multiple deployments can share one endpoint for safe rollouts.
from azure.ai.ml.entities import (
    ManagedOnlineDeployment,
    Model,
    OnlineScaleSettings
)

deployment = ManagedOnlineDeployment(
    name="blue",
    endpoint_name="fraud-detection-api",
    model=Model(path="./mlflow-model"),  # local model folder — verify path before running
    instance_type="Standard_DS3_v2",
    instance_count=2,
    # Autoscale between 1 and 5 instances, targeting 70% utilization.
    scale_settings=OnlineScaleSettings(
        scale_type="TargetUtilization",
        min_instances=1,
        max_instances=5,
        target_utilization_percentage=70
    ),
    request_settings={
        "request_timeout_ms": 90000,  # 90 s per-request ceiling
        "max_concurrent_requests_per_instance": 3
    },
    # Liveness probe: restart the container if it stops responding.
    liveness_probe={
        "initial_delay": 10,
        "period": 10,
        "timeout": 2,
        "failure_threshold": 3
    },
    # Readiness probe: route traffic only after the container reports healthy.
    readiness_probe={
        "initial_delay": 10,
        "period": 10,
        "timeout": 2,
        "failure_threshold": 3,
        "success_threshold": 1
    },
    # Exposed to the scoring container (score.py reads MODEL_VERSION).
    environment_variables={
        "MODEL_VERSION": "1.2.0",
        "LOG_LEVEL": "INFO"
    }
)

# Blocks until provisioning finishes (can take several minutes).
ml_client.online_deployments.begin_create_or_update(deployment).result()

Step 3: Route Traffic

# Route 100% traffic to blue deployment
# (traffic percentages across deployments must sum to 100).
endpoint.traffic = {"blue": 100}
ml_client.online_endpoints.begin_create_or_update(endpoint).result()

print(f"Endpoint ready: {endpoint.scoring_uri}")

Scoring Script

For custom models, provide a scoring script:
# score.py
import json
import logging
import os
import joblib
import numpy as np
from time import time

def init():
    """
    Called once when the deployment is created or updated.

    Loads the serialized model from the directory Azure ML mounts at
    AZUREML_MODEL_DIR and publishes it (plus a version string) as module
    globals consumed by run().

    Raises:
        KeyError: if AZUREML_MODEL_DIR is not set — failing fast with a
            clear error instead of an opaque TypeError from os.path.join.
    """
    global model
    global model_version

    # os.environ[...] (not os.getenv) so a missing variable raises a
    # descriptive KeyError rather than os.path.join choking on None.
    model_path = os.path.join(
        os.environ["AZUREML_MODEL_DIR"],
        "model.pkl"
    )

    # Load model and record which version is serving.
    model = joblib.load(model_path)
    model_version = os.getenv("MODEL_VERSION", "unknown")

    logging.info(f"Model loaded: version {model_version}")

def run(raw_data):
    """
    Called by the inference server for each scoring request.

    Args:
        raw_data: JSON string; expected payload shape is
            {"features": [[...], ...]} — a 2-D list of numeric rows.

    Returns:
        JSON string with predictions, class probabilities, the serving
        model version, and request latency in milliseconds. On failure,
        a JSON string with "error" and "error_type" keys is returned
        instead, so clients should check for the "error" key.
    """
    start_time = time()

    try:
        # Parse the request payload.
        data = json.loads(raw_data)
        features = np.array(data["features"])

        # Score using the globals published by init().
        predictions = model.predict(features)
        probabilities = model.predict_proba(features)

        # Assemble the response payload.
        result = {
            "predictions": predictions.tolist(),
            "probabilities": probabilities.tolist(),
            "model_version": model_version,
            "latency_ms": (time() - start_time) * 1000
        }

        logging.info(f"Processed {len(features)} samples in {result['latency_ms']:.2f}ms")

        return json.dumps(result)

    except Exception as e:
        # Preserve the error contract (JSON body with error details) but
        # use logging.exception so the full traceback lands in the logs.
        error = {
            "error": str(e),
            "error_type": type(e).__name__
        }
        logging.exception(f"Prediction failed: {error}")
        return json.dumps(error)

Invoke Endpoint

Using Python SDK

# Test with sample data
# NOTE(review): sample_data is defined here but the invoke call below reads
# "request.json" from disk — write sample_data to that file (or pass the
# data directly) before running.
sample_data = {
    "features": [
        [5.1, 3.5, 1.4, 0.2],
        [6.2, 2.9, 4.3, 1.3],
        [5.7, 2.8, 4.1, 1.3]
    ]
}

# Invoke endpoint
response = ml_client.online_endpoints.invoke(
    endpoint_name="fraud-detection-api",
    request_file="request.json",  # or pass data directly
    deployment_name="blue"  # Optional: target specific deployment
)

print(f"Response: {response}")

Using REST API

import requests
import json

# Get endpoint details (scoring URI and auth keys) from the workspace.
endpoint = ml_client.online_endpoints.get("fraud-detection-api")
keys = ml_client.online_endpoints.get_keys("fraud-detection-api")

# Make request — key-based auth still uses a Bearer Authorization header.
headers = {
    "Authorization": f"Bearer {keys.primary_key}",
    "Content-Type": "application/json"
}

# NOTE(review): sample_data is defined in the SDK example above.
response = requests.post(
    endpoint.scoring_uri,
    headers=headers,
    json=sample_data
)

if response.status_code == 200:
    predictions = response.json()
    print(f"Predictions: {predictions}")
else:
    print(f"Error {response.status_code}: {response.text}")

Using cURL

# Fill in the endpoint's primary key and scoring URI before running.
KEY="<primary-key>"
URI="https://fraud-detection-api.eastus.inference.ml.azure.com/score"

# POST a single feature row as JSON; the response body is the prediction.
curl -X POST $URI \
  -H "Authorization: Bearer $KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "features": [[5.1, 3.5, 1.4, 0.2]]
  }'

Traffic Management

Mirror Traffic (Shadow Testing)

Test new deployment without affecting production:
# Deploy new version
# NOTE(review): new_model, endpoint, and ml_client come from earlier steps.
green_deployment = ManagedOnlineDeployment(
    name="green",
    endpoint_name="fraud-detection-api",
    model=new_model,
    instance_type="Standard_DS3_v2",
    instance_count=1
)
ml_client.online_deployments.begin_create_or_update(green_deployment).result()

# Mirror 10% of traffic to green for testing
endpoint.mirror_traffic = {"green": 10}
ml_client.online_endpoints.begin_create_or_update(endpoint).result()

# Green receives copy of 10% traffic but responses are not returned
# Monitor green metrics, then do actual traffic split

Gradual Rollout

# time.sleep is used below; the module import was missing in the original
# snippet (earlier examples only do `from time import time`).
import time

# Phase 1: 10% to new version
endpoint.traffic = {"blue": 90, "green": 10}
ml_client.online_endpoints.begin_create_or_update(endpoint).result()
time.sleep(3600)  # Monitor for 1 hour

# Phase 2: 50/50 split
endpoint.traffic = {"blue": 50, "green": 50}
ml_client.online_endpoints.begin_create_or_update(endpoint).result()
time.sleep(3600)  # Monitor for 1 hour

# Phase 3: Complete rollout
endpoint.traffic = {"green": 100}
ml_client.online_endpoints.begin_create_or_update(endpoint).result()

# Cleanup: delete the old deployment once green serves all traffic.
ml_client.online_deployments.begin_delete(
    name="blue",
    endpoint_name="fraud-detection-api"
).result()

Autoscaling Configuration

from azure.ai.ml.entities import OnlineScaleSettings

# Target utilization autoscaling
scale_settings = OnlineScaleSettings(
    scale_type="TargetUtilization",
    min_instances=2,  # Always keep 2 instances minimum
    max_instances=10,  # Scale up to 10 instances
    target_utilization_percentage=70,  # Target 70% CPU utilization
    polling_interval=10  # Check every 10 seconds
)

# Attach the scale settings to a deployment on the existing endpoint.
deployment = ManagedOnlineDeployment(
    name="autoscale-deployment",
    endpoint_name="fraud-detection-api",
    model=model,
    instance_type="Standard_DS3_v2",
    scale_settings=scale_settings
)

ml_client.online_deployments.begin_create_or_update(deployment).result()

Monitoring and Logging

View Deployment Logs

# Get recent logs
logs = ml_client.online_deployments.get_logs(
    name="blue",
    endpoint_name="fraud-detection-api",
    lines=500,
    container_type="inference-server"  # or "storage-initializer"
)

print(logs)

Query Metrics

from azure.monitor.query import MetricsQueryClient
from azure.identity import DefaultAzureCredential
from datetime import timedelta

credential = DefaultAzureCredential()
metrics_client = MetricsQueryClient(credential)

# Get endpoint resource ID (the ARM ID used by Azure Monitor).
endpoint = ml_client.online_endpoints.get("fraud-detection-api")
resource_id = endpoint.id

# Query latency and throughput for the last hour.
metrics = metrics_client.query_resource(
    resource_id,
    metric_names=["RequestLatency", "RequestsPerMinute"],
    timespan=timedelta(hours=1)
)

for metric in metrics.metrics:
    print(f"{metric.name}: {metric.timeseries}")

Key Metrics

| Metric | Description | Threshold |
| --- | --- | --- |
| Request Latency (P95) | 95th percentile response time | <500ms |
| Requests Per Minute | Throughput | - |
| HTTP 2xx Rate | Success rate | >99% |
| HTTP 4xx Rate | Client errors | <1% |
| HTTP 5xx Rate | Server errors | <0.1% |
| CPU Utilization | Compute usage | <80% |
| Memory Utilization | RAM usage | <80% |
| Instance Count | Active instances | - |

Security Best Practices

Enable system-assigned identity for secure access:
# Token auth + a system-assigned managed identity, so the endpoint can
# access other Azure resources without stored credentials.
endpoint = ManagedOnlineEndpoint(
    name="secure-endpoint",
    auth_mode="aml_token",
    identity={
        "type": "SystemAssigned"
    }
)
Disable public access for sensitive workloads:
# Block the public internet; access goes through private networking only.
endpoint = ManagedOnlineEndpoint(
    name="private-endpoint",
    public_network_access="disabled"
)
# Regenerate primary key
# (rotate keys periodically; clients using the old key lose access).
ml_client.online_endpoints.regenerate_keys(
    name="fraud-detection-api",
    key_type="primary"
)
Encrypt data at rest with your own keys:
# Customer-managed key (CMK) encryption backed by Azure Key Vault.
endpoint = ManagedOnlineEndpoint(
    name="cmk-endpoint",
    encryption={
        "status": "Enabled",
        "key_vault_properties": {
            "key_vault_uri": "https://myvault.vault.azure.net",
            "key_name": "my-key",
            "key_version": "abc123"
        }
    }
)

Performance Optimization

  • Convert to ONNX format
  • Apply quantization
  • Prune unnecessary layers
  • Use model distillation
import torch

# Trace the model with a representative dummy input and export to ONNX.
# NOTE(review): assumes `model` is a torch.nn.Module accepting a single
# (N, 3, 224, 224) image tensor — confirm against the actual model.
# (The original `from torch import onnx` was redundant: the code calls
# torch.onnx.export through the torch attribute path.)
dummy_input = torch.randn(1, 3, 224, 224)
torch.onnx.export(
    model,
    dummy_input,
    "model.onnx",
    opset_version=11,
    input_names=['input'],
    output_names=['output']
)

Cost Management

# View endpoint costs
from azure.mgmt.costmanagement import CostManagementClient
from azure.identity import DefaultAzureCredential

cost_client = CostManagementClient(
    credential=DefaultAzureCredential(),
    subscription_id="<subscription-id>"
)

# Query costs for last 30 days
# NOTE(review): this query definition is built but never submitted — pass
# it to the Cost Management query API (with a scope) to actually execute.
query = {
    "type": "Usage",
    "timeframe": "MonthToDate",
    "dataset": {
        "granularity": "Daily",
        "filter": {
            "tags": {
                "name": "endpoint",
                "value": "fraud-detection-api"
            }
        }
    }
}

Next Steps

Batch Endpoints

Deploy for large-scale batch processing

Monitor Endpoints

Set up monitoring and alerts

MLOps

Automate deployment workflows

Troubleshooting

Debug deployment issues