kubernetes-sigs · kaushikmitr · Feb 27, 2025 · Mar 1, 2025 · Mar 1, 2025 · Mar 1, 2025
diff --git a/config/manifests/vllm/deployment.yaml b/config/manifests/vllm/deployment.yaml
@@ -3,7 +3,7 @@ kind: Deployment
 metadata:
   name: vllm-llama2-7b-pool
 spec:
-  replicas: 3
+  replicas: 6
   selector:
     matchLabels:
       app: vllm-llama2-7b-pool
@@ -24,15 +24,23 @@ spec:
           - "1"
           - "--port"
           - "8000"
+          - "--compilation-config"
+          - "3"
+          - "--max-num-seqs"
+          - "2048"
           - "--enable-lora"
           - "--max-loras"
           - "4"
           - "--max-cpu-loras"
-          - "12"
+          - "15"
+          - "--max-lora-rank"
+          - "16"
           - "--lora-modules"
           - '{"name": "tweet-summary-0", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
           - '{"name": "tweet-summary-1", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
           env:
+            - name: VLLM_USE_V1
+              value: "1"
             - name: PORT
               value: "8000"
             - name: HUGGING_FACE_HUB_TOKEN

diff --git a/pkg/epp/backend/vllm/metrics.go b/pkg/epp/backend/vllm/metrics.go
@@ -37,6 +37,7 @@ import (
 const (
 	LoraRequestInfoMetricName                = "vllm:lora_requests_info"
 	LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters"
+	LoraRequestInfoWaitingAdaptersMetricName = "waiting_lora_adapters"
 	LoraRequestInfoMaxAdaptersMetricName     = "max_lora"
 	// TODO: Replace these with the num_tokens_running/waiting below once we add those to the fork.
 	RunningQueueSizeMetricName = "vllm:num_requests_running"
@@ -136,6 +137,14 @@ func promToPodMetrics(
 					}
 				}
 			}
+			if label.GetName() == LoraRequestInfoWaitingAdaptersMetricName {
+				if label.GetValue() != "" {
+					adapterList := strings.Split(label.GetValue(), ",")
+					for _, adapter := range adapterList {
+						updated.ActiveModels[adapter] = 0
+					}
+				}
+			}
 			if label.GetName() == LoraRequestInfoMaxAdaptersMetricName {
 				if label.GetValue() != "" {
 					updated.MaxActiveModels, err = strconv.Atoi(label.GetValue())
@@ -161,14 +170,40 @@ func getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.Metr
 		logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", LoraRequestInfoMetricName)
 		return nil, time.Time{}, fmt.Errorf("metric family %q not found", LoraRequestInfoMetricName)
 	}
-	var latestTs float64
+
 	var latest *dto.Metric
+	var latestTs float64
+
+	// Iterate over all metrics in the family.
 	for _, m := range loraRequests.GetMetric() {
+		var running, waiting string
+		// Read the label values for running and waiting adapters.
+		for _, lp := range m.GetLabel() {
+			switch lp.GetName() {
+			case LoraRequestInfoRunningAdaptersMetricName:
+				running = lp.GetValue()
+			case LoraRequestInfoWaitingAdaptersMetricName:
+				waiting = lp.GetValue()
+			}
+		}
+
+		// Ignore metrics with both labels empty. This happens when there are no running or waiting requests on the server; in this case it is best to use the last set of active adapters.
+		if running == "" && waiting == "" {
+			//	continue
+		}
+
+		// Select the metric with the latest creation timestamp.
 		if m.GetGauge().GetValue() > latestTs {
 			latestTs = m.GetGauge().GetValue()
 			latest = m
 		}
 	}
+
+	if latest == nil {
+		return nil, time.Time{}, fmt.Errorf("no valid metric found")
+	}
+
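+	// Convert the gauge value (creation timestamp) to time.Time. NOTE(review): the value is scaled by 1000 and passed as the nanoseconds argument of time.Unix — confirm the gauge's unit.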
 	return latest, time.Unix(0, int64(latestTs*1000)), nil
 }
 

diff --git a/pkg/epp/scheduling/filter.go b/pkg/epp/scheduling/filter.go
@@ -19,6 +19,8 @@ package scheduling
 import (
 	"errors"
 	"math"
+	"math/rand"
+	"time"
 
 	"github.com/go-logr/logr"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
@@ -183,10 +185,59 @@ func lowLoRACostPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool {
 	return ok || len(pod.ActiveModels) < pod.MaxActiveModels
 }
 
-// loRAAffinityPredicate is a filter function to check whether a pod has affinity to the lora requested.
-func loRAAffinityPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool {
-	_, ok := pod.ActiveModels[req.ResolvedTargetModel]
-	return ok
+// loRASoftAffinityPredicate implements a pod selection strategy that prioritizes pods
+// with existing LoRA model affinity while allowing for load balancing through randomization.
+//
+// The function works by:
+// 1. Separating pods into two groups: those with target model affinity and those with available capacity
+// 2. Using a probability threshold to sometimes select from non-affinity pods to enable load balancing
+// 3. Falling back to whatever group has pods if one group is empty
+//
+// Parameters:
+//   - logger: Logger for diagnostic output (currently unused by this function)
+//   - req: LLM request containing the resolved target model
+//   - pods: Slice of pod metrics to filter
+//
+// Returns:
+//   - Filtered slice of pod metrics based on affinity and availability
+//   - Error: always nil in the current implementation
+func loRASoftAffinityPredicate(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) {
+
+	// Pre-allocate slices with estimated capacity
+	filtered_affinity := make([]*datastore.PodMetrics, 0, len(pods))
+	filtered_available := make([]*datastore.PodMetrics, 0, len(pods))
+
+	// Categorize pods based on affinity and availability
+	for _, pod := range pods {
+		if pod == nil {
+			continue
+		}
+
+		if _, exists := pod.ActiveModels[req.ResolvedTargetModel]; exists {
+			filtered_affinity = append(filtered_affinity, pod)
+		} else if len(pod.ActiveModels) < pod.MaxActiveModels {
+			filtered_available = append(filtered_available, pod)
+		}
+	}
+
+	// Create a math/rand generator seeded with the current time (not crypto/rand) for the probabilistic group choice below
+	randSource := rand.NewSource(time.Now().UnixNano())
+	randGen := rand.New(randSource)
+
+	// If both groups have pods, use probability to select which group to return
+	if len(filtered_affinity) > 0 && len(filtered_available) > 0 {
+		if randGen.Float64() < loraAffinityThreshold {
+			return filtered_affinity, nil
+		}
+		return filtered_available, nil
+	}
+
+	// Return whichever group has pods
+	if len(filtered_affinity) > 0 {
+		return filtered_affinity, nil
+	}
+
+	return filtered_available, nil
 }
 
 // canAcceptNewLoraPredicate is a filter function to check whether a pod has room to load the adapter.

diff --git a/pkg/epp/scheduling/scheduler.go b/pkg/epp/scheduling/scheduler.go
@@ -36,8 +36,11 @@ const (
 	queueThresholdCritical = 5
 	// TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16) Make this configurable.
 	// the threshold for queued requests to be considered low below which we can prioritize LoRA affinity.
-	// The value of 50 is arrived heuristicically based on experiments.
-	queueingThresholdLoRA = 50
+	// The value of 128 was arrived at heuristically based on experiments.
+	queueingThresholdLoRA = 128
+	// TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16) Make this configurable.
+	// loraAffinityThreshold is the probability with which we prefer a pod that has LoRA affinity over a pod that lacks affinity but still has room to fit more LoRA adapters.
+	loraAffinityThreshold = 0.999
 )
 
 var (
@@ -54,7 +57,7 @@ var (
 		filter: leastQueuingFilterFunc,
 		nextOnSuccessOrFailure: &filter{
 			name:   "low cost LoRA",
-			filter: toFilterFunc(lowLoRACostPredicate),
+			filter: loRASoftAffinityPredicate,
 			nextOnSuccessOrFailure: &filter{
 				name:   "least KV cache percent",
 				filter: leastKVCacheFilterFunc,
@@ -76,14 +79,9 @@ var (
 		name:   "low queueing filter",
 		filter: toFilterFunc((lowQueueingPodPredicate)),
 		nextOnSuccess: &filter{
-			name:          "affinity LoRA",
-			filter:        toFilterFunc(loRAAffinityPredicate),
-			nextOnSuccess: queueAndKVCacheFilter,
-			nextOnFailure: &filter{
-				name:                   "can accept LoRA Adapter",
-				filter:                 toFilterFunc(canAcceptNewLoraPredicate),
-				nextOnSuccessOrFailure: queueAndKVCacheFilter,
-			},
+			name:                   "affinity LoRA",
+			filter:                 loRASoftAffinityPredicate,
+			nextOnSuccessOrFailure: queueAndKVCacheFilter,
 		},
 		nextOnFailure: queueLoRAAndKVCacheFilter,
 	}