lokiwriter safer log server management
Some checks failed
continuous-integration/drone/push Build is failing
Some checks failed
continuous-integration/drone/push Build is failing
This commit is contained in:
@@ -5,6 +5,7 @@ import (
|
|||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"net/http"
|
"net/http"
|
||||||
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -12,6 +13,13 @@ type LokiWriter struct {
|
|||||||
url string
|
url string
|
||||||
labels map[string]string
|
labels map[string]string
|
||||||
httpClient *http.Client
|
httpClient *http.Client
|
||||||
|
|
||||||
|
// Circuit breaker fields
|
||||||
|
mu sync.RWMutex
|
||||||
|
failureCount int
|
||||||
|
circuitOpen bool
|
||||||
|
lastFailureTime time.Time
|
||||||
|
lastWarningTime time.Time
|
||||||
}
|
}
|
||||||
|
|
||||||
type LokiPayload struct {
|
type LokiPayload struct {
|
||||||
@@ -23,6 +31,13 @@ type LokiStream struct {
|
|||||||
Values [][]string `json:"values"`
|
Values [][]string `json:"values"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const (
|
||||||
|
// Circuit breaker configuration
|
||||||
|
maxFailures = 3 // Open circuit after this many consecutive failures
|
||||||
|
circuitOpenTime = 5 * time.Minute // How long to keep circuit open
|
||||||
|
warningInterval = 1 * time.Minute // Minimum time between warning messages
|
||||||
|
)
|
||||||
|
|
||||||
func NewLokiWriter(url string, labels map[string]string) *LokiWriter {
|
func NewLokiWriter(url string, labels map[string]string) *LokiWriter {
|
||||||
return &LokiWriter{
|
return &LokiWriter{
|
||||||
url: url,
|
url: url,
|
||||||
@@ -32,10 +47,30 @@ func NewLokiWriter(url string, labels map[string]string) *LokiWriter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (w *LokiWriter) Write(p []byte) (n int, err error) {
|
func (w *LokiWriter) Write(p []byte) (n int, err error) {
|
||||||
|
// Check circuit breaker status
|
||||||
|
w.mu.RLock()
|
||||||
|
if w.circuitOpen {
|
||||||
|
// Check if it's time to retry
|
||||||
|
if time.Since(w.lastFailureTime) < circuitOpenTime {
|
||||||
|
w.mu.RUnlock()
|
||||||
|
// Circuit is open, silently discard log to avoid spam
|
||||||
|
return len(p), nil
|
||||||
|
}
|
||||||
|
w.mu.RUnlock()
|
||||||
|
// Time to retry - acquire write lock to close circuit
|
||||||
|
w.mu.Lock()
|
||||||
|
w.circuitOpen = false
|
||||||
|
w.failureCount = 0
|
||||||
|
w.mu.Unlock()
|
||||||
|
} else {
|
||||||
|
w.mu.RUnlock()
|
||||||
|
}
|
||||||
|
|
||||||
// Use zerolog to parse the log level
|
// Use zerolog to parse the log level
|
||||||
var event map[string]interface{}
|
var event map[string]interface{}
|
||||||
if err := json.Unmarshal(p, &event); err != nil {
|
if err := json.Unmarshal(p, &event); err != nil {
|
||||||
return 0, fmt.Errorf("failed to unmarshal log event: %w", err)
|
// Don't fail on unmarshal errors, just silently continue
|
||||||
|
return len(p), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
level := ""
|
level := ""
|
||||||
@@ -71,27 +106,61 @@ func (w *LokiWriter) Write(p []byte) (n int, err error) {
|
|||||||
|
|
||||||
payloadBytes, err := json.Marshal(payload)
|
payloadBytes, err := json.Marshal(payload)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return 0, fmt.Errorf("failed to marshal payload: %w", err)
|
// Don't fail on marshal errors
|
||||||
|
return len(p), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
//fmt.Printf("Sending payload to Loki: %s\n", string(payloadBytes))
|
|
||||||
|
|
||||||
req, err := http.NewRequest("POST", w.url, bytes.NewReader(payloadBytes))
|
req, err := http.NewRequest("POST", w.url, bytes.NewReader(payloadBytes))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return 0, fmt.Errorf("failed to create HTTP request: %w", err)
|
w.recordFailure(fmt.Sprintf("failed to create HTTP request: %v", err))
|
||||||
|
return len(p), nil
|
||||||
}
|
}
|
||||||
req.Header.Set("Content-Type", "application/json")
|
req.Header.Set("Content-Type", "application/json")
|
||||||
|
|
||||||
resp, err := w.httpClient.Do(req)
|
resp, err := w.httpClient.Do(req)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return 0, fmt.Errorf("failed to send log to Loki: %w", err)
|
w.recordFailure(fmt.Sprintf("failed to send log to Loki: %v", err))
|
||||||
|
return len(p), nil
|
||||||
}
|
}
|
||||||
defer resp.Body.Close()
|
defer resp.Body.Close()
|
||||||
|
|
||||||
//fmt.Printf("Loki response status: %d\n", resp.StatusCode)
|
|
||||||
if resp.StatusCode != http.StatusNoContent {
|
if resp.StatusCode != http.StatusNoContent {
|
||||||
return 0, fmt.Errorf("received non-204 response from Loki: %d", resp.StatusCode)
|
w.recordFailure(fmt.Sprintf("received non-204 response from Loki: %d", resp.StatusCode))
|
||||||
|
return len(p), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Success - reset failure count
|
||||||
|
w.mu.Lock()
|
||||||
|
if w.failureCount > 0 {
|
||||||
|
// Circuit was previously failing but now recovered
|
||||||
|
fmt.Printf("LokiWriter: connection restored to %s\n", w.url)
|
||||||
|
w.failureCount = 0
|
||||||
|
}
|
||||||
|
w.mu.Unlock()
|
||||||
|
|
||||||
return len(p), nil
|
return len(p), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// recordFailure handles a Loki write failure and opens circuit if needed
|
||||||
|
func (w *LokiWriter) recordFailure(errMsg string) {
|
||||||
|
w.mu.Lock()
|
||||||
|
defer w.mu.Unlock()
|
||||||
|
|
||||||
|
w.failureCount++
|
||||||
|
w.lastFailureTime = time.Now()
|
||||||
|
|
||||||
|
// Only print warnings if enough time has passed since last warning
|
||||||
|
shouldWarn := time.Since(w.lastWarningTime) >= warningInterval
|
||||||
|
|
||||||
|
if w.failureCount >= maxFailures && !w.circuitOpen {
|
||||||
|
w.circuitOpen = true
|
||||||
|
if shouldWarn {
|
||||||
|
fmt.Printf("LokiWriter: circuit breaker opened after %d failures (last error: %s). Remote logging disabled for %v.\n",
|
||||||
|
w.failureCount, errMsg, circuitOpenTime)
|
||||||
|
w.lastWarningTime = time.Now()
|
||||||
|
}
|
||||||
|
} else if !w.circuitOpen && shouldWarn {
|
||||||
|
fmt.Printf("LokiWriter: warning - %s (failure %d/%d)\n", errMsg, w.failureCount, maxFailures)
|
||||||
|
w.lastWarningTime = time.Now()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user