sgl-project · merrymercy · Nov 22, 2024 · Nov 22, 2024
@@ -136,8 +136,8 @@ def __init__(
                 )
         else:
             self.recv_from_tokenizer = None
-            self.send_to_tokenizer = SimpleNamespace(send_pyobj=lambda x: None)
-            self.send_to_detokenizer = SimpleNamespace(send_pyobj=lambda x: None)
+            self.send_to_tokenizer = SimpleNamespace(send_pyobj=lambda _: None)
+            self.send_to_detokenizer = SimpleNamespace(send_pyobj=lambda _: None)
 
         # Init tokenizer
         self.model_config = ModelConfig(
@@ -1030,7 +1030,8 @@ def process_batch_result_prefill(self, batch: ScheduleBatch, result):
                 else:
                     self.tree_cache.cache_unfinished_req(req)
 
-        self.stream_output(batch.reqs)
+        if self.tp_rank == 0:
+            self.stream_output(batch.reqs)
 
     def process_batch_result_decode(self, batch: ScheduleBatch, result):
         logits_output, next_token_ids, bid = result
@@ -1081,7 +1082,8 @@ def process_batch_result_decode(self, batch: ScheduleBatch, result):
             torch.cuda.current_stream().synchronize()
             batch.next_batch_sampling_info.sampling_info_done.set()
 
-        self.stream_output(batch.reqs)
+        if self.tp_rank == 0:
+            self.stream_output(batch.reqs)
 
         self.token_to_kv_pool.free_group_end()
 

@@ -181,7 +181,7 @@ def init_torch_distributed(self):
         if self.device == "cuda":
             torch.cuda.set_device(self.gpu_id)
             backend = "nccl"
-        # ToDO(liangan1):Just use gloo to bypass the initilization fail
+        # TODO(liangan1): Just use gloo to bypass the initialization failure
         # Need to use xccl for xpu backend in the future
         elif self.device == "xpu":
             torch.xpu.set_device(self.gpu_id)