diff --git a/src/bentoml/_internal/marshal/dispatcher.py b/src/bentoml/_internal/marshal/dispatcher.py
index c086e61007c..3a267542a2c 100644
--- a/src/bentoml/_internal/marshal/dispatcher.py
+++ b/src/bentoml/_internal/marshal/dispatcher.py
@@ -354,7 +354,10 @@ async def controller(self):
                 a = self.optimizer.o_a
                 b = self.optimizer.o_b
 
-                if n > 1 and (w0 + a * n + b) >= self.max_latency_in_ms:
+                # the estimated latency of the first request if we began processing now
+                latency_0 = w0 + a * n + b
+
+                if n > 1 and latency_0 >= self.max_latency_in_ms:
                     self._queue.popleft()[2].cancel()
                     continue
                 if self._sema.is_locked():
@@ -363,12 +366,23 @@ async def controller(self):
                         continue
                     await asyncio.sleep(self.tick_interval)
                     continue
-                if (
+
+                # we are now free to dispatch whenever we like
+                while (
+                    # if we don't already have enough requests,
                     n < self.max_batch_size
-                    and n * (wn + dt + (a or 0)) <= self.optimizer.wait * decay
+                    # we are not about to cancel the first request,
+                    and latency_0 + dt <= self.max_latency_in_ms * 0.95
+                    # and waiting will cause average latency to decrease
+                    and n * (wn + dt + a) <= self.optimizer.wait * decay
                 ):
+                    n = len(self._queue)
+                    now = time.time()
+                    wn = now - self._queue[-1][0]
+                    latency_0 = w0 + a * n + b
+
+                    # wait for additional requests to arrive
                     await asyncio.sleep(self.tick_interval)
-                    continue
 
                 n_call_out = min(self.max_batch_size, n)
                 # call
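
For reviewers, here is a minimal standalone sketch of the wait condition this diff introduces. The function name `should_keep_waiting`, its parameter names, and all sample numbers are illustrative only, not part of the BentoML API; the body mirrors the three clauses of the new `while` condition.

```python
# A standalone sketch of the diff's wait-loop condition (hypothetical names).

def should_keep_waiting(
    n: int,                  # requests currently queued
    wn: float,               # wait time of the newest request (ms)
    dt: float,               # one tick interval (ms)
    a: float,                # optimizer slope: per-request processing cost (ms)
    latency_0: float,        # estimated latency of the oldest request (ms)
    max_latency_ms: float,   # hard latency budget
    optimizer_wait: float,   # wait budget learned by the optimizer (ms)
    max_batch_size: int,
    decay: float = 0.95,     # decay rate of the wait budget, as in the diff
) -> bool:
    return (
        # the batch still has room,
        n < max_batch_size
        # one more tick won't push the oldest request past 95% of the budget,
        and latency_0 + dt <= max_latency_ms * 0.95
        # and the optimizer still expects waiting to lower average latency.
        and n * (wn + dt + a) <= optimizer_wait * decay
    )


if __name__ == "__main__":
    # 3 queued, newest waited 2 ms, 1 ms tick, 5 ms/request cost: keep waiting
    print(should_keep_waiting(3, 2.0, 1.0, 5.0, 40.0, 100.0, 60.0, 8))  # True
    # oldest request is near the latency budget: dispatch now
    print(should_keep_waiting(3, 2.0, 1.0, 5.0, 95.0, 100.0, 60.0, 8))  # False
```

Turning the `if ... continue` into a `while` lets the controller refresh `n`, `wn`, and `latency_0` each tick instead of falling back through the outer loop, and the new `latency_0 + dt` clause dispatches early rather than waiting until the first request must be cancelled.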