scheduling changes for lora affinity load balancing #423

Open · wants to merge 7 commits into main
2 changes: 1 addition & 1 deletion config/manifests/vllm/deployment.yaml
@@ -120,4 +120,4 @@ data:
- base-model: meta-llama/Llama-2-7b-hf
id: tweet-summary-1
source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
1 change: 1 addition & 0 deletions docs/proposals/003-model-server-protocol/README.md
@@ -47,3 +47,4 @@ The model server MUST expose the following LoRA adapter metrics via the same Pro
requested adapter. Example: `"max_lora": "8"`.
* `running_lora_adapters`: A comma separated list of adapters that are currently loaded in GPU
memory and ready to serve requests. Example: `"running_lora_adapters": "adapter1, adapter2"`
* `waiting_lora_adapters`: A comma separated list of adapters that are waiting to be served. Example: `"waiting_lora_adapters": "adapter1, adapter2"`
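For illustration, a scrape of a server with two adapters loaded and one queued could expose a sample of this shape (the adapter names and the gauge value, which carries the sample's creation timestamp, are made up):

```
vllm:lora_requests_info{max_lora="8",running_lora_adapters="adapter1,adapter2",waiting_lora_adapters="adapter3"} 1.7409e+09
```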
44 changes: 41 additions & 3 deletions pkg/epp/backend/vllm/metrics.go
@@ -34,9 +34,13 @@ import (
logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
)

// Metric names used in the vLLM metrics implementation.
// Refer to the protocol doc for more details:
// https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/003-model-server-protocol
const (
LoraRequestInfoMetricName = "vllm:lora_requests_info"
LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters"
LoraRequestInfoWaitingAdaptersMetricName = "waiting_lora_adapters"
Contributor:

On one hand, I can see why considering waiting adapters is useful, because waiting LoRAs are going to be served next. However, I have concerns about this weakening the LoRA affinity: running is bounded by max_lora, while waiting is not bounded. If we enter an unstable state with a long waiting queue, we can lose the affinity benefit.

An improved algorithm could prioritize waiting over running. What do you think?

Contributor Author (@kaushikmitr, Feb 28, 2025):

Using waiting + running for affinity is always superior to using just running, because adapters are served on a first-come, first-served basis, so we know for sure that a waiting adapter will get loaded if it is not already available. But yes, within waiting + running, prioritizing waiting over running makes sense I think; it needs testing first.
LoraRequestInfoMaxAdaptersMetricName = "max_lora"
// TODO: Replace these with the num_tokens_running/waiting below once we add those to the fork.
Contributor:

Can you clean up the TODOs and the metrics that are not currently used?

Contributor Author:

I think the TODOs are still relevant. I will remove the KV cache max token capacity metric because it's not being used.

RunningQueueSizeMetricName = "vllm:num_requests_running"
@@ -45,8 +49,7 @@ const (
RunningQueueSizeMetricName = "vllm:num_tokens_running"
WaitingQueueSizeMetricName = "vllm:num_tokens_waiting"
*/
KVCacheUsagePercentMetricName = "vllm:gpu_cache_usage_perc"
KvCacheMaxTokenCapacityMetricName = "vllm:gpu_cache_max_token_capacity"
KVCacheUsagePercentMetricName = "vllm:gpu_cache_usage_perc"
)

type PodMetricsClientImpl struct{}
@@ -136,6 +139,14 @@ func promToPodMetrics(
}
}
}
if label.GetName() == LoraRequestInfoWaitingAdaptersMetricName {
if label.GetValue() != "" {
adapterList := strings.Split(label.GetValue(), ",")
for _, adapter := range adapterList {
updated.ActiveModels[adapter] = 0
}
}
}
if label.GetName() == LoraRequestInfoMaxAdaptersMetricName {
if label.GetValue() != "" {
updated.MaxActiveModels, err = strconv.Atoi(label.GetValue())
@@ -161,14 +172,41 @@ func getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.Metr
logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", LoraRequestInfoMetricName)
return nil, time.Time{}, fmt.Errorf("metric family %q not found", LoraRequestInfoMetricName)
}
var latestTs float64

var latest *dto.Metric
var latestTs float64

// Iterate over all metrics in the family.
for _, m := range loraRequests.GetMetric() {
var running, waiting string
// Read the label values for running and waiting adapters.
for _, lp := range m.GetLabel() {
switch lp.GetName() {
case LoraRequestInfoRunningAdaptersMetricName:
running = lp.GetValue()
case LoraRequestInfoWaitingAdaptersMetricName:
waiting = lp.GetValue()
}
}

// Ignore metrics with both labels empty.
Contributor:

Suggested change:
// Ignore metrics with both labels empty.
// Ignore metrics with both labels empty. This happens when there are no running or waiting requests on
// the server, in this case it is best to use the last set of active adapters.
if running == "" && waiting == "" {
continue
Contributor:

Does this happen in practice? In what cases?

Contributor Author:

When there are no running requests, that is, QPS = 0 for the pod. In this case it's good to use the last adapters that ran to determine affinity (as a proxy for which LoRAs are already loaded).
}

// Select the metric with the latest creation timestamp.
if m.GetGauge().GetValue() > latestTs {
latestTs = m.GetGauge().GetValue()
latest = m
}
}

if latest == nil {
logger.V(logutil.TRACE).Info("Metric value Empty", "value", latest, "metric", LoraRequestInfoMetricName)
return nil, time.Time{}, nil
}

// Convert the gauge value (creation timestamp) to time.Time.
return latest, time.Unix(0, int64(latestTs*1000)), nil
}
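As a toy, standalone sketch of the selection rule above (sample values are made up): samples whose running and waiting labels are both empty are skipped, and among the rest the one with the largest gauge value, i.e. the latest creation timestamp, wins.

```go
package main

import "fmt"

// sample stands in for one vllm:lora_requests_info series: the gauge value is
// the creation timestamp and the labels carry the adapter lists.
type sample struct {
	createdAt float64 // gauge value, seconds since the epoch
	running   string
	waiting   string
}

// latestSample skips samples with both label values empty and returns the
// remaining sample with the largest creation timestamp (nil if none qualify).
func latestSample(samples []sample) *sample {
	var latest *sample
	var latestTs float64
	for i := range samples {
		s := &samples[i]
		if s.running == "" && s.waiting == "" {
			continue
		}
		if s.createdAt > latestTs {
			latestTs = s.createdAt
			latest = s
		}
	}
	return latest
}

func main() {
	samples := []sample{
		{createdAt: 1700000100, running: "adapter1"},
		{createdAt: 1700000200}, // idle sample, ignored
		{createdAt: 1700000150, running: "adapter1,adapter2", waiting: "adapter3"},
	}
	fmt.Printf("picked sample created at %.0f\n", latestSample(samples).createdAt) // 1700000150
}
```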

61 changes: 52 additions & 9 deletions pkg/epp/scheduling/filter.go
@@ -19,6 +19,8 @@ package scheduling
import (
"errors"
"math"
"math/rand"
"time"

"github.com/go-logr/logr"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
@@ -183,18 +185,59 @@ func lowLoRACostPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool {
return ok || len(pod.ActiveModels) < pod.MaxActiveModels
}

// loRAAffinityPredicate is a filter function to check whether a pod has affinity to the lora requested.
func loRAAffinityPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool {
_, ok := pod.ActiveModels[req.ResolvedTargetModel]
return ok
}
// loRASoftAffinityPredicate implements a pod selection strategy that prioritizes pods
// with existing LoRA model affinity while allowing for load balancing through randomization.
//
// The function works by:
// 1. Separating pods into two groups: those with target model affinity and those with available capacity
// 2. Using a probability threshold to sometimes select from non-affinity pods to enable load balancing
// 3. Falling back to whatever group has pods if one group is empty
//
// Parameters:
// - logger: Logger interface for diagnostic output
// - req: LLM request containing the resolved target model
// - pods: Slice of pod metrics to filter
//
// Returns:
// - Filtered slice of pod metrics based on affinity and availability
// - Error if any issues occur during filtering
func loRASoftAffinityFilter(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) {

// Pre-allocate slices with estimated capacity
filtered_affinity := make([]*datastore.PodMetrics, 0, len(pods))
filtered_available := make([]*datastore.PodMetrics, 0, len(pods))

// Categorize pods based on affinity and availability
for _, pod := range pods {

if _, exists := pod.ActiveModels[req.ResolvedTargetModel]; exists {
filtered_affinity = append(filtered_affinity, pod)
} else if len(pod.ActiveModels) < pod.MaxActiveModels {
Contributor:

This is essentially the canAcceptNewLoraPredicate function below; are we still using canAcceptNewLoraPredicate?

Contributor Author:

We are not using canAcceptNewLoraPredicate any more, but I think it would be good to keep.

filtered_available = append(filtered_available, pod)
}
}

// Seed a local math/rand generator for the probabilistic selection below
randSource := rand.NewSource(time.Now().UnixNano())
Contributor:

This can be a follow-up, but it sounds like we can extend the current filter framework to support such probability-based filtering. So instead of having one base filter, we would have a list of filters with weights. This way we can keep each filter very focused and make them more reusable.

Contributor Author:

Agreed.
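A minimal sketch of the weighted-filter idea floated here; it is not part of this PR, and the names (weightedFilter, pickFilter) and weights are hypothetical:

```go
package main

import (
	"fmt"
	"math/rand"
)

// weightedFilter pairs a focused filter with a selection weight; the filter
// body itself is elided because only the selection mechanism matters here.
type weightedFilter struct {
	name   string
	weight float64
}

// pickFilter samples one filter with probability proportional to its weight.
func pickFilter(r *rand.Rand, filters []weightedFilter) weightedFilter {
	total := 0.0
	for _, f := range filters {
		total += f.weight
	}
	x := r.Float64() * total
	for _, f := range filters {
		if x < f.weight {
			return f
		}
		x -= f.weight
	}
	return filters[len(filters)-1]
}

func main() {
	r := rand.New(rand.NewSource(42))
	filters := []weightedFilter{
		{name: "lora affinity", weight: 0.999},
		{name: "can fit new lora", weight: 0.001},
	}
	counts := map[string]int{}
	for i := 0; i < 100000; i++ {
		counts[pickFilter(r, filters).name]++
	}
	fmt.Println(counts) // roughly 99,900 affinity picks vs ~100 capacity picks
}
```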

randGen := rand.New(randSource)

// If both groups have pods, use probability to select which group to return
if len(filtered_affinity) > 0 && len(filtered_available) > 0 {
if randGen.Float64() < loraAffinityThreshold {
return filtered_affinity, nil
}
return filtered_available, nil
}

// Return whichever group has pods
if len(filtered_affinity) > 0 {
return filtered_affinity, nil
}

// canAcceptNewLoraPredicate is a filter function to check whether a pod has room to load the adapter.
func canAcceptNewLoraPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool {
return len(pod.ActiveModels) < pod.MaxActiveModels
return filtered_available, nil
}

func criticalRequestPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool {
func criticalRequestPredicate(req *LLMRequest, _ *datastore.PodMetrics) bool {
return req.Critical
}

20 changes: 9 additions & 11 deletions pkg/epp/scheduling/scheduler.go
@@ -36,8 +36,11 @@ const (
queueThresholdCritical = 5
// TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16) Make this configurable.
// the threshold for queued requests to be considered low below which we can prioritize LoRA affinity.
// The value of 50 was arrived at heuristically based on experiments.
queueingThresholdLoRA = 50
// The value of 128 was arrived at heuristically based on experiments.
queueingThresholdLoRA = 128
Contributor:

I think we should make this configurable, perhaps via a flag for now. Different environments will likely need different thresholds.

Contributor Author:

I would rather leverage #16 to make this configurable.

Contributor:

I don't think we have time to do an API change for the next release. Given we already had to change it on different accelerator types, it's important to have this knob configurable. Exposing it as a flag seems straightforward and gives us time to gather feedback before making an API change.

Contributor Author:

I took a look; as far as I understand, adding this flag is not straightforward given the way the scheduler is written. If it's needed for the next release, I would rather do it in a separate PR.

Contributor:

Defining a flag for each parameter is tedious; we can use a versioned configuration file instead, which is called ComponentConfig. Ideally we do that for #383. Here is JobSet's config file as an example: https://github.com/kubernetes-sigs/jobset/tree/main/api/config/v1alpha1
// TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16) Make this configurable.
// loraAffinityThreshold indicates the probability with which we prefer a pod with LoRA affinity over a pod without but having room to fit more LoRA adapters.
loraAffinityThreshold = 0.999
Contributor:

Do you have some insights to show why this is needed and why this value was picked?

Contributor Author:

I picked it after some trial and error. This value worked well when we had skewed traffic across adapters: it helped spread out high-QPS adapters while keeping low-QPS adapters less spread out.
)

var (
@@ -54,7 +57,7 @@ var (
filter: leastQueuingFilterFunc,
nextOnSuccessOrFailure: &filter{
name: "low cost LoRA",
filter: toFilterFunc(lowLoRACostPredicate),
filter: loRASoftAffinityFilter,
nextOnSuccessOrFailure: &filter{
name: "least KV cache percent",
filter: leastKVCacheFilterFunc,
@@ -76,14 +79,9 @@ var (
name: "low queueing filter",
filter: toFilterFunc((lowQueueingPodPredicate)),
nextOnSuccess: &filter{
name: "affinity LoRA",
filter: toFilterFunc(loRAAffinityPredicate),
nextOnSuccess: queueAndKVCacheFilter,
nextOnFailure: &filter{
name: "can accept LoRA Adapter",
filter: toFilterFunc(canAcceptNewLoraPredicate),
nextOnSuccessOrFailure: queueAndKVCacheFilter,
},
name: "affinity LoRA",
filter: loRASoftAffinityFilter,
nextOnSuccessOrFailure: queueAndKVCacheFilter,
},
nextOnFailure: queueLoRAAndKVCacheFilter,
}
2 changes: 1 addition & 1 deletion test/integration/hermetic_test.go
@@ -179,7 +179,7 @@ func TestKubeInferenceModelRequest(t *testing.T) {
},
}),
extprocutils.FakePodMetrics(1, datastore.Metrics{
WaitingQueueSize: 50,
WaitingQueueSize: 200,
KVCacheUsagePercent: 0.1,
ActiveModels: map[string]int{
"foo": 1,