diff --git a/lokiwriter.go b/lokiwriter.go index 7afecf1..6422425 100644 --- a/lokiwriter.go +++ b/lokiwriter.go @@ -5,6 +5,7 @@ import ( "encoding/json" "fmt" "net/http" + "sync" "time" ) @@ -12,6 +13,13 @@ type LokiWriter struct { url string labels map[string]string httpClient *http.Client + + // Circuit breaker fields + mu sync.RWMutex + failureCount int + circuitOpen bool + lastFailureTime time.Time + lastWarningTime time.Time } type LokiPayload struct { @@ -23,6 +31,13 @@ type LokiStream struct { Values [][]string `json:"values"` } +const ( + // Circuit breaker configuration + maxFailures = 3 // Open circuit after this many consecutive failures + circuitOpenTime = 5 * time.Minute // How long to keep circuit open + warningInterval = 1 * time.Minute // Minimum time between warning messages +) + func NewLokiWriter(url string, labels map[string]string) *LokiWriter { return &LokiWriter{ url: url, @@ -32,10 +47,30 @@ func NewLokiWriter(url string, labels map[string]string) *LokiWriter { } func (w *LokiWriter) Write(p []byte) (n int, err error) { + // Check circuit breaker status + w.mu.RLock() + if w.circuitOpen { + // Check if it's time to retry + if time.Since(w.lastFailureTime) < circuitOpenTime { + w.mu.RUnlock() + // Circuit is open, silently discard log to avoid spam + return len(p), nil + } + w.mu.RUnlock() + // Time to retry - acquire write lock to close circuit + w.mu.Lock() + w.circuitOpen = false + w.failureCount = 0 + w.mu.Unlock() + } else { + w.mu.RUnlock() + } + // Use zerolog to parse the log level var event map[string]interface{} if err := json.Unmarshal(p, &event); err != nil { - return 0, fmt.Errorf("failed to unmarshal log event: %w", err) + // Don't fail on unmarshal errors, just silently continue + return len(p), nil } level := "" @@ -71,27 +106,61 @@ func (w *LokiWriter) Write(p []byte) (n int, err error) { payloadBytes, err := json.Marshal(payload) if err != nil { - return 0, fmt.Errorf("failed to marshal payload: %w", err) + // Don't fail on marshal errors + return len(p), nil } - //fmt.Printf("Sending payload to Loki: %s\n", string(payloadBytes)) - req, err := http.NewRequest("POST", w.url, bytes.NewReader(payloadBytes)) if err != nil { - return 0, fmt.Errorf("failed to create HTTP request: %w", err) + w.recordFailure(fmt.Sprintf("failed to create HTTP request: %v", err)) + return len(p), nil } req.Header.Set("Content-Type", "application/json") resp, err := w.httpClient.Do(req) if err != nil { - return 0, fmt.Errorf("failed to send log to Loki: %w", err) + w.recordFailure(fmt.Sprintf("failed to send log to Loki: %v", err)) + return len(p), nil } defer resp.Body.Close() - //fmt.Printf("Loki response status: %d\n", resp.StatusCode) if resp.StatusCode != http.StatusNoContent { - return 0, fmt.Errorf("received non-204 response from Loki: %d", resp.StatusCode) + w.recordFailure(fmt.Sprintf("received non-204 response from Loki: %d", resp.StatusCode)) + return len(p), nil } + // Success - reset failure count + w.mu.Lock() + if w.failureCount > 0 { + // Circuit was previously failing but now recovered + fmt.Printf("LokiWriter: connection restored to %s\n", w.url) + w.failureCount = 0 + } + w.mu.Unlock() + return len(p), nil } + +// recordFailure handles a Loki write failure and opens circuit if needed +func (w *LokiWriter) recordFailure(errMsg string) { + w.mu.Lock() + defer w.mu.Unlock() + + w.failureCount++ + w.lastFailureTime = time.Now() + + // Only print warnings if enough time has passed since last warning + shouldWarn := time.Since(w.lastWarningTime) >= warningInterval + + if w.failureCount >= maxFailures && !w.circuitOpen { + w.circuitOpen = true + if shouldWarn { + fmt.Printf("LokiWriter: circuit breaker opened after %d failures (last error: %s). Remote logging disabled for %v.\n", + w.failureCount, errMsg, circuitOpenTime) + w.lastWarningTime = time.Now() + } + } else if !w.circuitOpen && shouldWarn { + fmt.Printf("LokiWriter: warning - %s (failure %d/%d)\n", errMsg, w.failureCount, maxFailures) + w.lastWarningTime = time.Now() + } +}