Monitoring and Alerting
Learning Objectives
Identify critical metrics for XRPL integration health
Implement logging and metrics collection
Configure alerts for anomalies and failures
Build dashboards for operational visibility
Respond to incidents with appropriate runbooks
/**
 * ConnectionMonitor — tracks the XRPL WebSocket connection lifecycle and
 * publishes state transitions to a metrics backend.
 *
 * @param {object} metricsClient - client exposing gauge/increment/timing
 */
class ConnectionMonitor {
  constructor(metricsClient) {
    this.metrics = metricsClient
    this.connectionState = 'disconnected'
    this.lastConnectedTime = null
    this.disconnectCount = 0
  }

  /**
   * Record a successful connection to `server`: sets the state gauge to 1
   * and bumps the per-server connect counter.
   */
  recordConnect(server) {
    this.connectionState = 'connected'
    this.lastConnectedTime = Date.now()
    this.metrics.gauge('xrpl.connection.state', 1, { server })
    this.metrics.increment('xrpl.connection.connects', { server })
  }

  /**
   * Record a disconnect from `server` and emit how long the connection
   * was up (0 when we were never connected).
   *
   * @param {string} server - server URL the client was attached to
   * @param {number} code - WebSocket close code
   * @param {string} reason - close reason string
   */
  recordDisconnect(server, code, reason) {
    this.connectionState = 'disconnected'
    this.disconnectCount++
    const connectedDuration = this.lastConnectedTime
      ? Date.now() - this.lastConnectedTime
      : 0
    this.metrics.gauge('xrpl.connection.state', 0, { server })
    this.metrics.increment('xrpl.connection.disconnects', { server, code })
    this.metrics.timing('xrpl.connection.duration', connectedDuration, { server })
    // BUG FIX: the log message was missing template-literal backticks.
    console.log(`Disconnected from ${server}: code=${code}, reason=${reason}`)
  }

  /**
   * @returns {{connected: boolean, disconnectCount: number, lastConnected: ?number}}
   *   snapshot suitable for a health endpoint
   */
  getHealth() {
    return {
      connected: this.connectionState === 'connected',
      disconnectCount: this.disconnectCount,
      lastConnected: this.lastConnectedTime
    }
  }
}
```
/**
 * RequestMonitor — records per-command latency and outcome metrics for
 * XRPL client requests, and can transparently instrument a client.
 */
class RequestMonitor {
  constructor(metricsClient) {
    this.metrics = metricsClient
  }

  /**
   * Record one request outcome.
   *
   * @param {string} command - XRPL API command name
   * @param {number} latencyMs - round-trip time in milliseconds
   * @param {boolean} success - whether the request succeeded
   * @param {?string} errorType - error label when success is false
   */
  recordRequest(command, latencyMs, success, errorType = null) {
    // Latency histogram
    this.metrics.timing('xrpl.request.latency', latencyMs, { command })
    // Success/failure counter
    if (success) {
      this.metrics.increment('xrpl.request.success', { command })
    } else {
      this.metrics.increment('xrpl.request.error', { command, error: errorType })
    }
    // Alert on high latency
    if (latencyMs > 5000) {
      // BUG FIX: the warning was missing template-literal backticks.
      console.warn(`High latency request: ${command} took ${latencyMs}ms`)
    }
  }

  /**
   * Monkey-patch `client.request` so every call is timed and recorded.
   * Errors are counted (using error.data.error when present) and
   * re-thrown unchanged. Returns the same client instance.
   */
  wrapClient(client) {
    const originalRequest = client.request.bind(client)
    const monitor = this
    client.request = async function (request) {
      const start = Date.now()
      let success = true
      let errorType = null
      try {
        return await originalRequest(request)
      } catch (error) {
        success = false
        errorType = error.data?.error || error.name || 'unknown'
        throw error
      } finally {
        // finally runs on both paths, so every request is recorded exactly once
        monitor.recordRequest(request.command, Date.now() - start, success, errorType)
      }
    }
    return client
  }
}
```
/**
 * TransactionMonitor — records transaction submission outcomes, payment
 * volume, and wallet balance metrics.
 */
class TransactionMonitor {
  constructor(metricsClient) {
    this.metrics = metricsClient
  }

  /**
   * Record the outcome of one transaction submission.
   *
   * @param {string} txType - e.g. 'Payment', 'TrustSet'
   * @param {boolean} success - whether submission succeeded
   * @param {string} result - engine result code (tagged on failures)
   * @param {number} latencyMs - submission latency
   */
  recordSubmission(txType, success, result, latencyMs) {
    this.metrics.timing('xrpl.transaction.latency', latencyMs, { type: txType })
    if (success) {
      this.metrics.increment('xrpl.transaction.success', { type: txType })
    } else {
      this.metrics.increment('xrpl.transaction.failure', { type: txType, result })
    }
  }

  /**
   * Count a successful payment; XRP payments additionally feed the
   * amount gauge and cumulative volume counter.
   */
  recordPayment(amount, currency, success) {
    if (success) {
      this.metrics.increment('xrpl.payment.count', { currency })
      if (currency === 'XRP') {
        this.metrics.gauge('xrpl.payment.amount', amount, { currency })
        this.metrics.increment('xrpl.payment.volume', amount)
      }
    }
  }

  /**
   * Gauge a wallet's XRP balance; warns below the 100 XRP threshold
   * (the alert rules escalate this to critical).
   */
  recordWalletBalance(address, balanceXRP) {
    this.metrics.gauge('xrpl.wallet.balance', balanceXRP, { address })
    // Alert on low balance
    if (balanceXRP < 100) {
      // BUG FIX: the warning was missing template-literal backticks.
      console.warn(`Low wallet balance: ${address} has ${balanceXRP} XRP`)
    }
  }
}
```
// Winston logger: timestamped, JSON-formatted structured logs.
// Errors land in error.log; everything at 'info' and above also lands
// in combined.log. NOTE(review): no Console transport is configured,
// so nothing reaches stdout — confirm that is intentional for prod.
const winston = require('winston')
const logger = winston.createLogger({
level: 'info',
format: winston.format.combine(
winston.format.timestamp(),
winston.format.json()
),
// defaultMeta is merged into every log entry emitted by this logger.
defaultMeta: { service: 'xrpl-integration' },
transports: [
new winston.transports.File({ filename: 'error.log', level: 'error' }),
new winston.transports.File({ filename: 'combined.log' }),
],
})
// Structured logging examples
/**
 * Emit a structured "payment_sent" entry for an outbound payment.
 * Currency defaults to 'XRP' when the payment object omits it.
 */
function logPaymentSent(payment, result) {
  const entry = {
    event: 'payment_sent',
    hash: result.hash,
    destination: payment.destination,
    amount: payment.amount,
    currency: payment.currency || 'XRP',
    result: result.result,
    latencyMs: result.latencyMs
  }
  logger.info('Payment sent', entry)
}
/**
 * Emit a structured "payment_received" entry for an incoming transaction.
 * Pulls the delivered amount and destination tag off the parsed tx.
 */
function logPaymentReceived(tx) {
  const entry = {
    event: 'payment_received',
    hash: tx.hash,
    from: tx.Account,
    amount: tx.deliveredAmount,
    destinationTag: tx.DestinationTag,
    ledger: tx.ledger_index
  }
  logger.info('Payment received', entry)
}
/**
 * Emit a structured "xrpl_error" entry: message, rippled error code when
 * available (error.data.error), stack trace, plus any caller context.
 */
function logError(error, context) {
  const code = error.data?.error || error.code
  logger.error('XRPL error', {
    event: 'xrpl_error',
    error: error.message,
    code,
    stack: error.stack,
    ...context
  })
}
```
/**
 * AuditLogger — writes audit records for transactions and security
 * events to a pluggable storage backend.
 *
 * @param {object} storage - backend exposing async write(record)
 * @param {?function} alertFn - optional async handler invoked for
 *   critical security events (backward-compatible addition)
 */
class AuditLogger {
  constructor(storage, alertFn = null) {
    this.storage = storage // Database, S3, etc.
    this.alertFn = alertFn
  }

  /**
   * Persist an audit record for a transaction, with request-level
   * attribution (userId / requestId / ipAddress) from `metadata`.
   */
  async logTransaction(tx, metadata) {
    const auditRecord = {
      timestamp: new Date().toISOString(),
      type: 'transaction',
      hash: tx.hash,
      account: tx.Account,
      transactionType: tx.TransactionType,
      amount: tx.Amount,
      destination: tx.Destination,
      result: tx.meta?.TransactionResult,
      metadata: {
        userId: metadata.userId,
        requestId: metadata.requestId,
        ipAddress: metadata.ipAddress
      }
    }
    await this.storage.write(auditRecord)
  }

  /**
   * Persist a security event (severity defaults to 'info') and alert the
   * security team immediately when severity is 'critical'.
   */
  async logSecurityEvent(event, details) {
    const auditRecord = {
      timestamp: new Date().toISOString(),
      type: 'security',
      event: event,
      severity: details.severity || 'info',
      details: details,
      sourceIp: details.sourceIp
    }
    await this.storage.write(auditRecord)
    // Immediate alert for high severity
    if (details.severity === 'critical') {
      await this.alertSecurityTeam(auditRecord)
    }
  }

  /**
   * Notify the security team of a critical event.
   * BUG FIX: this method was referenced above but never defined, so any
   * critical event threw a TypeError. Routes through the optional
   * constructor hook; falls back to console so the event is never lost.
   */
  async alertSecurityTeam(auditRecord) {
    if (this.alertFn) {
      await this.alertFn(auditRecord)
    } else {
      console.error('SECURITY ALERT (no alert channel configured):', auditRecord.event)
    }
  }
}
```
// Declarative alert rule catalog, grouped by subsystem. Each rule carries
// a metric condition expression, an optional sustain duration, a severity
// ('critical' pages; 'warning' notifies), and a human-readable message.
// NOTE(review): conditions are strings in a Prometheus-like syntax —
// confirm the evaluation engine they target.
const alertRules = {
// Connection health: total loss and flapping.
connection: {
disconnected: {
condition: 'connection_state == 0',
duration: '1m',
severity: 'critical',
message: 'XRPL connection lost for over 1 minute'
},
highDisconnectRate: {
condition: 'rate(disconnects[5m]) > 5',
severity: 'warning',
message: 'High disconnect rate: more than 5 disconnects in 5 minutes'
}
},
// Request-level error rate and tail latency.
requests: {
highErrorRate: {
condition: 'error_rate > 0.05', // 5%
duration: '5m',
severity: 'warning',
message: 'Request error rate above 5%'
},
highLatency: {
condition: 'p95_latency > 2000', // 2 seconds
duration: '5m',
severity: 'warning',
message: 'P95 request latency above 2 seconds'
}
},
// Transaction submission failures and validation backlog.
transactions: {
failureSpike: {
condition: 'rate(transaction_failures[5m]) > 10',
severity: 'critical',
message: 'Transaction failure rate spiking'
},
pendingTransactions: {
condition: 'pending_transactions > 100',
duration: '10m',
severity: 'warning',
message: 'Many transactions pending validation'
}
},
// Hot-wallet balance thresholds (warning at 1000 XRP, critical at 100).
wallet: {
lowBalance: {
condition: 'wallet_balance < 1000', // XRP
severity: 'warning',
message: 'Hot wallet balance below 1000 XRP'
},
criticalLowBalance: {
condition: 'wallet_balance < 100',
severity: 'critical',
message: 'Hot wallet balance critically low'
}
}
}
```
/**
 * AlertManager — evaluates alert conditions, de-duplicates firings with a
 * cooldown window, routes alerts to channels by severity, and sends
 * resolution notices when a condition clears.
 *
 * @param {object} config - { channels: Array<{type, send, sendResolution}>,
 *                            cooldownMinutes?: number (default 30) }
 */
class AlertManager {
  constructor(config) {
    this.channels = config.channels // Slack, PagerDuty, email, etc.
    this.activeAlerts = new Map()
    this.cooldownMinutes = config.cooldownMinutes || 30
  }

  /**
   * Evaluate one alert: fire when `condition` is truthy (respecting the
   * cooldown), resolve when it is falsy and the alert was active.
   */
  async checkAndAlert(alertId, condition, severity, message) {
    if (!condition) {
      // Condition resolved - clear alert
      if (this.activeAlerts.has(alertId)) {
        await this.resolveAlert(alertId)
      }
      return
    }
    // Check cooldown: don't re-fire the same alert within the window
    const existingAlert = this.activeAlerts.get(alertId)
    if (existingAlert) {
      const minutesSince = (Date.now() - existingAlert.time) / 60000
      if (minutesSince < this.cooldownMinutes) {
        return // In cooldown
      }
    }
    // Fire alert
    await this.fireAlert(alertId, severity, message)
  }

  /** Mark the alert active and notify the severity-appropriate channels. */
  async fireAlert(alertId, severity, message) {
    const alert = {
      id: alertId,
      severity,
      message,
      time: Date.now()
    }
    this.activeAlerts.set(alertId, alert)
    // Route to appropriate channels
    const channels = this.getChannelsForSeverity(severity)
    for (const channel of channels) {
      await channel.send(alert)
    }
    // BUG FIX: the log message was missing template-literal backticks.
    console.log(`ALERT [${severity}]: ${message}`)
  }

  /** Clear an active alert and send a resolution notice to all channels. */
  async resolveAlert(alertId) {
    const alert = this.activeAlerts.get(alertId)
    if (!alert) return
    this.activeAlerts.delete(alertId)
    const duration = Date.now() - alert.time
    // BUG FIX: the log message was missing template-literal backticks.
    console.log(`RESOLVED [${alert.severity}]: ${alert.message} (duration: ${duration}ms)`)
    // Notify resolution
    for (const channel of this.channels) {
      await channel.sendResolution(alert, duration)
    }
  }

  /**
   * Severity routing: critical → every channel; warning → everything but
   * PagerDuty; anything else → Slack only.
   */
  getChannelsForSeverity(severity) {
    switch (severity) {
      case 'critical':
        return this.channels // All channels
      case 'warning':
        return this.channels.filter(c => c.type !== 'pagerduty')
      default:
        return this.channels.filter(c => c.type === 'slack')
    }
  }
}
```
// Dashboard layout: three pages of panels keyed by Prometheus-style
// queries. NOTE(review): metric names here use underscores
// (xrpl_connection_state) while the monitors above emit dotted names
// (xrpl.connection.state) — confirm the exporter performs that mapping.
const dashboardConfig = {
// Page 1: connection health, traffic, errors, tail latency.
overview: {
panels: [
{
title: 'Connection Status',
type: 'status',
query: 'xrpl_connection_state',
thresholds: { good: 1, bad: 0 }
},
{
title: 'Request Rate',
type: 'graph',
query: 'rate(xrpl_requests_total[1m])'
},
{
title: 'Error Rate',
type: 'graph',
query: 'rate(xrpl_request_errors[1m]) / rate(xrpl_requests_total[1m])'
},
{
title: 'Request Latency (P95)',
type: 'graph',
query: 'histogram_quantile(0.95, xrpl_request_latency_bucket)'
}
]
},
// Page 2: transaction outcomes and payment volume.
transactions: {
panels: [
{
title: 'Transaction Success Rate',
type: 'gauge',
query: 'xrpl_transaction_success / xrpl_transaction_total'
},
{
title: 'Payments by Result',
type: 'piechart',
query: 'sum by (result) (xrpl_transactions_total)'
},
{
title: 'Payment Volume (XRP)',
type: 'graph',
query: 'sum(rate(xrpl_payment_volume[1h]))'
}
]
},
// Page 3: hot-wallet balance, thresholds matching the alert rules.
wallets: {
panels: [
{
title: 'Hot Wallet Balance',
type: 'gauge',
query: 'xrpl_wallet_balance{type="hot"}',
thresholds: { warning: 1000, critical: 100 }
},
{
title: 'Balance History',
type: 'graph',
query: 'xrpl_wallet_balance'
}
]
}
}
```
# XRPL Connection Lost Runbook
- **Name:** xrpl_connection_lost
- **Severity:** Critical
- **Condition:** Connection to XRPL network lost for >1 minute
- Unable to query account balances
- Unable to submit transactions
- Unable to monitor incoming payments
- Check server status: `curl https://s1.ripple.com:51234/`
- Check network connectivity: `ping s1.ripple.com`
- Check application logs for disconnect reason
Check XRPL network status: https://livenet.xrpl.org/ (the legacy xrpcharts.ripple.com dashboard has been retired)
- If single server issue, verify failover to backup servers
- If network issue, check infrastructure (firewall, DNS, etc.)
- If application issue, restart service: `systemctl restart xrpl-service`
- Verify reconnection in logs and metrics
- If not resolved in 15 minutes, page on-call engineer
- If affecting payments, notify finance team
/**
 * AutoRemediation — first-line automated responses to common alerts,
 * escalating to a human via the alert manager when they fail.
 *
 * @param {object} client - XRPL client exposing async connect()
 * @param {object} alertManager - exposes async escalate(id, message)
 * @param {?function} replenishFn - optional async hook that tops up the
 *   hot wallet and resolves truthy on success (backward-compatible addition)
 */
class AutoRemediation {
  constructor(client, alertManager, replenishFn = null) {
    this.client = client
    this.alertManager = alertManager
    this.replenishFn = replenishFn
  }

  /**
   * Try to reconnect up to 3 times with linear backoff (5s, 10s, 15s).
   * @returns {Promise<boolean>} true on reconnect, false after escalating
   */
  async handleConnectionLost() {
    console.log('Auto-remediation: Attempting reconnection...')
    for (let attempt = 1; attempt <= 3; attempt++) {
      try {
        await this.client.connect()
        console.log('Auto-remediation: Reconnection successful')
        return true
      } catch (error) {
        // BUG FIX: the message was missing template-literal backticks, and
        // `sleep` was an undefined free identifier — inlined a Promise delay.
        console.log(`Auto-remediation: Attempt ${attempt} failed`)
        await new Promise((resolve) => setTimeout(resolve, 5000 * attempt))
      }
    }
    console.log('Auto-remediation: Failed, escalating to human')
    await this.alertManager.escalate('connection_lost', 'Auto-remediation failed')
    return false
  }

  /** Attempt wallet replenishment; escalate when it fails or is unset. */
  async handleLowBalance(threshold) {
    console.log('Auto-remediation: Low balance detected')
    // Check if we have a replenishment process
    const replenished = await this.requestReplenishment()
    if (!replenished) {
      await this.alertManager.escalate('low_balance', 'Replenishment failed')
    }
  }

  /**
   * Run the configured replenishment hook.
   * BUG FIX: this method was referenced above but never defined, so low
   * balance handling threw a TypeError. Returns false (forcing escalation)
   * when no hook is configured or the hook throws.
   */
  async requestReplenishment() {
    if (!this.replenishFn) return false
    try {
      return Boolean(await this.replenishFn())
    } catch (error) {
      console.error('Auto-remediation: replenishment failed', error)
      return false
    }
  }
}
```
- Tracks connection, request, and transaction metrics
- Implements structured logging
- Configures alerts for critical conditions
- Provides a dashboard view of system health
Time Investment: 3-4 hours
End of Lesson 16
Key Takeaways
Monitor connection state:
Most critical single metric.
Track latency percentiles:
Averages hide problems.
Alert on anomalies:
Rate of change, not just thresholds.
Use structured logging:
Makes debugging possible.
Maintain runbooks:
Keep them updated and tested.

---