Documentation Index: Fetch the complete documentation index at https://mintlify.com/MicrosoftDocs/azure-ai-docs/llms.txt
Use this file to discover all available pages before exploring further.
Online Endpoints for Real-Time Inference
Online endpoints provide real-time inference for machine learning models with low latency, automatic scaling, and built-in monitoring.
Managed online endpoints handle infrastructure, scaling, and security automatically - you focus on your model.
What are Online Endpoints?
Online endpoints deploy models to web servers that return predictions via HTTP. They’re optimized for:
Low Latency: Sub-second response times
Synchronous Requests: Request-response pattern
Real-Time Scoring: Immediate predictions
Auto Scaling: Handle traffic spikes
When to Use Online Endpoints
Choose online endpoints when:
Low Latency Required
Need predictions in <1 second
Request-Response Pattern
Client waits for immediate response
Small Payloads
Input data fits in HTTP request
Variable Traffic
Need to scale based on requests
Managed Online Endpoints
Recommended deployment method with full infrastructure management.
Key Features
Infrastructure
Scaling
Monitoring
Security
Fully Managed:
Automatic compute provisioning
OS updates and patching
Node recovery on failure
Load balancing
SSL termination
Elastic Scaling:
Manual instance count control
Autoscaling based on metrics
Scale to zero support
Not bound by cluster size
Built-in Observability:
Azure Monitor integration
Log Analytics workspace
Custom metrics tracking
Cost breakdown per deployment
Performance dashboards
Enterprise-Ready:
Private Link support
Customer-managed keys
Microsoft Entra ID auth
Network isolation
RBAC integration
Create Online Endpoint
Step 1: Define Endpoint
from azure.ai.ml import MLClient
from azure.ai.ml.entities import ManagedOnlineEndpoint
from azure.identity import DefaultAzureCredential

# Client scoped to one workspace; every call below goes through it.
ml_client = MLClient(
    DefaultAzureCredential(),
    subscription_id="<subscription-id>",
    resource_group="<resource-group>",
    workspace_name="<workspace>",
)

# Create endpoint
endpoint = ManagedOnlineEndpoint(
    name="fraud-detection-api",
    description="Real-time fraud detection endpoint",
    auth_mode="key",  # or "aml_token" (Azure ML token) / "aad_token" (Microsoft Entra ID)
    tags={
        "environment": "production",
        "team": "ml-ops",
        "cost-center": "engineering",
    },
)

# Keep the entity returned by the service: scoring_uri is assigned
# server-side and is not populated on the locally constructed object.
endpoint = ml_client.online_endpoints.begin_create_or_update(endpoint).result()
print(f"Endpoint URI: {endpoint.scoring_uri}")
Step 2: Create Deployment
MLflow Model
Custom Scoring
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.entities import (
    ManagedOnlineDeployment,
    Model,
    OnlineRequestSettings,
    ProbeSettings,
    TargetUtilizationScaleSettings,
)

deployment = ManagedOnlineDeployment(
    name="blue",
    endpoint_name="fraud-detection-api",
    # Mark the asset as an MLflow model; otherwise it is treated as a custom
    # model, which needs its own scoring script and environment.
    model=Model(path="./mlflow-model", type=AssetTypes.MLFLOW_MODEL),
    instance_type="Standard_DS3_v2",
    instance_count=2,
    # NOTE(review): target-utilization scale settings are documented for
    # Kubernetes online deployments; managed online endpoints are normally
    # autoscaled through Azure Monitor autoscale rules - confirm before use.
    scale_settings=TargetUtilizationScaleSettings(
        min_instances=1,
        max_instances=5,
        target_utilization_percentage=70,
    ),
    # Typed settings entities (not plain dicts) are what the SDK serializes.
    request_settings=OnlineRequestSettings(
        request_timeout_ms=90000,
        max_concurrent_requests_per_instance=3,
    ),
    liveness_probe=ProbeSettings(
        initial_delay=10,
        period=10,
        timeout=2,
        failure_threshold=3,
    ),
    readiness_probe=ProbeSettings(
        initial_delay=10,
        period=10,
        timeout=2,
        failure_threshold=3,
        success_threshold=1,
    ),
    environment_variables={
        "MODEL_VERSION": "1.2.0",
        "LOG_LEVEL": "INFO",
    },
)

# Blocks until the long-running create/update operation finishes.
ml_client.online_deployments.begin_create_or_update(deployment).result()
Step 3: Route Traffic
# Route 100% traffic to blue deployment
endpoint.traffic = {"blue": 100}

# Rebind to the service-returned entity so scoring_uri reflects the
# deployed endpoint rather than the local, possibly unpopulated, object.
endpoint = ml_client.online_endpoints.begin_create_or_update(endpoint).result()
print(f"Endpoint ready: {endpoint.scoring_uri}")
Scoring Script
For custom models, provide a scoring script:
# score.py
import json
import logging
import os
import joblib
import numpy as np
from time import time
def init():
    """
    Called when deployment is created or updated.
    Load model and set up resources.

    Sets the module-level globals ``model`` and ``model_version`` that
    ``run`` reads on every request.

    Raises:
        RuntimeError: if the AZUREML_MODEL_DIR environment variable is unset
            or empty, so the model file cannot be located.
    """
    global model
    global model_version

    # Fail with an explicit message instead of the opaque TypeError that
    # os.path.join(None, ...) would raise.
    model_dir = os.getenv("AZUREML_MODEL_DIR")
    if not model_dir:
        raise RuntimeError(
            "AZUREML_MODEL_DIR is not set; cannot locate model.pkl"
        )

    # Get model path
    model_path = os.path.join(model_dir, "model.pkl")

    # Load model
    model = joblib.load(model_path)
    model_version = os.getenv("MODEL_VERSION", "unknown")
    logging.info("Model loaded: version %s", model_version)
def run(raw_data):
    """
    Called for each inference request.

    Args:
        raw_data: JSON string with input data; must contain a "features" key.

    Returns:
        JSON string with predictions, probabilities, model version and
        latency on success. On any failure, a JSON string with "error" and
        "error_type" keys is returned instead of raising, so callers must
        check for the "error" key.
    """
    start_time = time()
    try:
        # Parse input
        data = json.loads(raw_data)
        features = np.array(data["features"])

        # Make prediction
        predictions = model.predict(features)
        probabilities = model.predict_proba(features)

        # Prepare response
        latency_ms = (time() - start_time) * 1000
        result = {
            "predictions": predictions.tolist(),
            "probabilities": probabilities.tolist(),
            "model_version": model_version,
            "latency_ms": latency_ms,
        }
        logging.info(
            "Processed %d samples in %.2f ms", len(features), latency_ms
        )
        return json.dumps(result)
    except Exception as e:
        # Request boundary: report the failure to the caller as JSON, but
        # log with the traceback so the root cause is not lost.
        error = {
            "error": str(e),
            "error_type": type(e).__name__,
        }
        logging.exception("Prediction failed: %s", error)
        return json.dumps(error)
Invoke Endpoint
Using Python SDK
# Test with sample data
import json

sample_data = {
    "features": [
        [5.1, 3.5, 1.4, 0.2],
        [6.2, 2.9, 4.3, 1.3],
        [5.7, 2.8, 4.1, 1.3],
    ]
}

# invoke() takes the request body from a file, so persist the payload first.
with open("request.json", "w", encoding="utf-8") as request_file:
    json.dump(sample_data, request_file)

# Invoke endpoint
response = ml_client.online_endpoints.invoke(
    endpoint_name="fraud-detection-api",
    request_file="request.json",
    deployment_name="blue",  # Optional: target specific deployment
)
print(f"Response: {response}")
Using REST API
import requests
import json

# Get endpoint details
endpoint = ml_client.online_endpoints.get("fraud-detection-api")
keys = ml_client.online_endpoints.get_keys("fraud-detection-api")

# Make request
headers = {
    "Authorization": f"Bearer {keys.primary_key}",
    "Content-Type": "application/json",
}
response = requests.post(
    endpoint.scoring_uri,
    headers=headers,
    json=sample_data,
    # requests has no default timeout; without one a stalled connection
    # would block this call indefinitely.
    timeout=30,
)

if response.status_code == 200:
    predictions = response.json()
    print(f"Predictions: {predictions}")
else:
    print(f"Error {response.status_code}: {response.text}")
Using cURL
# Shell assignments must not have spaces around "=".
KEY="<primary-key>"
URI="https://fraud-detection-api.eastus.inference.ml.azure.com/score"

# Quote the variables so the URL and key are passed as single arguments.
curl -X POST "$URI" \
  -H "Authorization: Bearer $KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "features": [[5.1, 3.5, 1.4, 0.2]]
  }'
Traffic Management
Mirror Traffic (Shadow Testing)
Test new deployment without affecting production:
# Deploy new version
# Stand up the candidate ("green") deployment alongside the live one.
green_deployment = ManagedOnlineDeployment(
    endpoint_name="fraud-detection-api",
    name="green",
    model=new_model,
    instance_count=1,
    instance_type="Standard_DS3_v2",
)
green_poller = ml_client.online_deployments.begin_create_or_update(
    green_deployment
)
green_poller.result()

# Mirror 10% of traffic to green for testing
endpoint.mirror_traffic = dict(green=10)
endpoint_poller = ml_client.online_endpoints.begin_create_or_update(endpoint)
endpoint_poller.result()

# Green receives copy of 10% traffic but responses are not returned
# Monitor green metrics, then do actual traffic split
Gradual Rollout
import time  # needed for time.sleep between rollout phases

MONITOR_SECONDS = 3600  # observe each phase for 1 hour before widening

# Phase 1: 10% to new version
endpoint.traffic = {"blue": 90, "green": 10}
ml_client.online_endpoints.begin_create_or_update(endpoint).result()
time.sleep(MONITOR_SECONDS)

# Phase 2: 50/50 split
endpoint.traffic = {"blue": 50, "green": 50}
ml_client.online_endpoints.begin_create_or_update(endpoint).result()
time.sleep(MONITOR_SECONDS)

# Phase 3: Complete rollout
endpoint.traffic = {"green": 100}
ml_client.online_endpoints.begin_create_or_update(endpoint).result()

# Cleanup old deployment (only after green carries all traffic)
ml_client.online_deployments.begin_delete(
    name="blue",
    endpoint_name="fraud-detection-api",
).result()
Autoscaling Configuration
from azure.ai.ml.entities import TargetUtilizationScaleSettings

# Target utilization autoscaling
# NOTE(review): target-utilization scale settings are documented for
# Kubernetes online deployments; managed online endpoints are normally
# autoscaled through Azure Monitor autoscale rules - confirm before use.
scale_settings = TargetUtilizationScaleSettings(
    min_instances=2,  # Always keep 2 instances minimum
    max_instances=10,  # Scale up to 10 instances
    target_utilization_percentage=70,  # Target 70% CPU utilization
    polling_interval=10,  # Check every 10 seconds
)

deployment = ManagedOnlineDeployment(
    name="autoscale-deployment",
    endpoint_name="fraud-detection-api",
    model=model,
    instance_type="Standard_DS3_v2",
    scale_settings=scale_settings,
)
ml_client.online_deployments.begin_create_or_update(deployment).result()
Monitoring and Logging
View Deployment Logs
# Get recent logs
# Fetch the latest 500 log lines from the "blue" deployment.
log_request = dict(
    name="blue",
    endpoint_name="fraud-detection-api",
    lines=500,
    container_type="inference-server",  # or "storage-initializer"
)
logs = ml_client.online_deployments.get_logs(**log_request)
print(logs)
Query Metrics
from datetime import timedelta

from azure.identity import DefaultAzureCredential
from azure.monitor.query import MetricsQueryClient

credential = DefaultAzureCredential()
metrics_client = MetricsQueryClient(credential)

# The endpoint's resource ID is the target of the metrics query.
endpoint = ml_client.online_endpoints.get("fraud-detection-api")
resource_id = endpoint.id

# Latency and throughput over the trailing hour.
wanted_metrics = ["RequestLatency", "RequestsPerMinute"]
lookback = timedelta(hours=1)
metrics = metrics_client.query_resource(
    resource_id,
    metric_names=wanted_metrics,
    timespan=lookback,
)

for metric in metrics.metrics:
    print(f"{metric.name}: {metric.timeseries}")
Key Metrics
| Metric | Description | Threshold |
| --- | --- | --- |
| Request Latency (P95) | 95th percentile response time | <500ms |
| Requests Per Minute | Throughput | - |
| HTTP 2xx Rate | Success rate | >99% |
| HTTP 4xx Rate | Client errors | <1% |
| HTTP 5xx Rate | Server errors | <0.1% |
| CPU Utilization | Compute usage | <80% |
| Memory Utilization | RAM usage | <80% |
| Instance Count | Active instances | - |
Security Best Practices
# Enable system-assigned identity for secure access:
from azure.ai.ml.entities import IdentityConfiguration

endpoint = ManagedOnlineEndpoint(
    name="secure-endpoint",
    auth_mode="aml_token",
    # The SDK expects an IdentityConfiguration entity rather than a raw dict.
    identity=IdentityConfiguration(type="system_assigned"),
)
# Disable public access for sensitive workloads:
private_endpoint_settings = {
    "name": "private-endpoint",
    "public_network_access": "disabled",
}
endpoint = ManagedOnlineEndpoint(**private_endpoint_settings)
# Regenerate primary key
# The SDK exposes this as a long-running operation (begin_*); wait on the
# poller so the new key is in place before continuing.
ml_client.online_endpoints.begin_regenerate_keys(
    name="fraud-detection-api",
    key_type="primary",
).result()
Use Customer-Managed Keys
# Encrypt data at rest with your own keys:
# NOTE(review): confirm that ManagedOnlineEndpoint honors an `encryption`
# argument; customer-managed keys may need to be configured elsewhere.
key_vault_properties = {
    "key_vault_uri": "https://myvault.vault.azure.net",
    "key_name": "my-key",
    "key_version": "abc123",
}
encryption_settings = {
    "status": "Enabled",
    "key_vault_properties": key_vault_properties,
}
endpoint = ManagedOnlineEndpoint(
    name="cmk-endpoint",
    encryption=encryption_settings,
)
Model Optimization
Batch Predictions
Caching
GPU Acceleration
Convert to ONNX format
Apply quantization
Prune unnecessary layers
Use model distillation
import torch
from torch import onnx

# Convert PyTorch to ONNX
# The exporter needs an example input; here a single 3x224x224 tensor.
dummy_input = torch.randn(1, 3, 224, 224)
export_options = {
    "opset_version": 11,
    "input_names": ["input"],
    "output_names": ["output"],
}
torch.onnx.export(model, dummy_input, "model.onnx", **export_options)
Process multiple requests together: # In score.py
def run(raw_data):
    """Score every row of the request with a single model call.

    Args:
        raw_data: JSON string with a "features" key.

    Returns:
        JSON string of the form {"predictions": [...]}.
    """
    payload = json.loads(raw_data)
    # Shape: (n, num_features)
    batch = np.array(payload["features"])
    # Model handles batch automatically
    scores = model.predict(batch).tolist()
    return json.dumps({"predictions": scores})
Implement response caching: from functools import lru_cache
import hashlib
@lru_cache(maxsize=1000)
def cached_predict(features_key):
    """Return model predictions for the input described by ``features_key``.

    Args:
        features_key: hashable ``(shape, dtype_str, raw_bytes)`` tuple built
            by ``run``. The array is rebuilt from the key itself, so the
            function does not depend on any variable outside its arguments.

    Returns:
        Whatever ``model.predict`` returns; memoized (LRU, 1000 entries).
    """
    shape, dtype_str, raw_bytes = features_key
    features = np.frombuffer(raw_bytes, dtype=dtype_str).reshape(shape)
    return model.predict(features)


def run(raw_data):
    """Score a request, reusing cached predictions for identical inputs.

    Args:
        raw_data: JSON string with a "features" key.

    Returns:
        JSON string of the form {"predictions": [...]}.
    """
    data = json.loads(raw_data)
    features = np.ascontiguousarray(data["features"])

    # Key on shape and dtype as well as the bytes: arrays with the same
    # bytes but a different layout must not share a cache entry.
    features_key = (features.shape, features.dtype.str, features.tobytes())
    predictions = cached_predict(features_key)
    return json.dumps({"predictions": predictions.tolist()})
# Use GPU instances for deep learning:
gpu_sku = "Standard_NC6s_v3"  # 1x V100 GPU
deployment = ManagedOnlineDeployment(
    endpoint_name="my-endpoint",
    name="gpu-deployment",
    model=model,
    instance_count=2,
    instance_type=gpu_sku,
)
Cost Management
# View endpoint costs
from azure.identity import DefaultAzureCredential
from azure.mgmt.costmanagement import CostManagementClient

# NOTE(review): recent azure-mgmt-costmanagement releases take only the
# credential; confirm whether subscription_id is still accepted here.
cost_client = CostManagementClient(
    credential=DefaultAzureCredential(),
    subscription_id="<subscription-id>",
)

# Query daily costs for the current month to date, limited to resources
# tagged endpoint=fraud-detection-api.
query = {
    "type": "Usage",
    "timeframe": "MonthToDate",
    "dataset": {
        "granularity": "Daily",
        # An aggregation is needed for the query to return cost totals.
        "aggregation": {
            "totalCost": {"name": "PreTaxCost", "function": "Sum"},
        },
        "filter": {
            # Tag filters are comparison expressions: name + operator + values.
            "tags": {
                "name": "endpoint",
                "operator": "In",
                "values": ["fraud-detection-api"],
            }
        },
    },
}

# Run the query at subscription scope and print the returned rows.
cost_result = cost_client.query.usage(
    scope="/subscriptions/<subscription-id>",
    parameters=query,
)
for row in cost_result.rows:
    print(row)
Next Steps
Batch Endpoints: Deploy for large-scale batch processing
Monitor Endpoints: Set up monitoring and alerts
MLOps: Automate deployment workflows
Troubleshooting: Debug deployment issues