diff --git a/src/bentoml/_internal/marshal/dispatcher.py b/src/bentoml/_internal/marshal/dispatcher.py
index c086e61007c..3a267542a2c 100644
--- a/src/bentoml/_internal/marshal/dispatcher.py
+++ b/src/bentoml/_internal/marshal/dispatcher.py
@@ -354,7 +354,10 @@ async def controller(self):
                 a = self.optimizer.o_a
                 b = self.optimizer.o_b
 
-                if n > 1 and (w0 + a * n + b) >= self.max_latency_in_ms:
+                # the estimated latency of the first request if we began processing now
+                latency_0 = w0 + a * n + b
+
+                if n > 1 and latency_0 >= self.max_latency_in_ms:
                     self._queue.popleft()[2].cancel()
                     continue
                 if self._sema.is_locked():
@@ -363,12 +366,23 @@ async def controller(self):
                         continue
                     await asyncio.sleep(self.tick_interval)
                     continue
-                if (
+
+                # we are now free to dispatch whenever we like
+                while (
+                    # if we don't already have enough requests,
                     n < self.max_batch_size
-                    and n * (wn + dt + (a or 0)) <= self.optimizer.wait * decay
+                    # we are not about to cancel the first request,
+                    and latency_0 + dt <= self.max_latency_in_ms * 0.95
+                    # and waiting will cause average latency to decrease
+                    and n * (wn + dt + a) <= self.optimizer.wait * decay
                 ):
+                    n = len(self._queue)
+                    now = time.time()
+                    wn = now - self._queue[-1][0]
+                    latency_0 = w0 + a * n + b
+
+                    # wait for additional requests to arrive
                     await asyncio.sleep(self.tick_interval)
-                    continue
 
                 n_call_out = min(self.max_batch_size, n)
                 # call
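
For reviewers, here is a minimal standalone sketch of the wait condition this diff introduces. The function name `should_keep_waiting`, its parameter names, and all sample numbers are illustrative only, not part of the BentoML API; the body mirrors the three clauses of the new `while` condition.

```python
# A standalone sketch of the diff's wait-loop condition (hypothetical names).

def should_keep_waiting(
    n: int,                  # requests currently queued
    wn: float,               # wait time of the newest request (ms)
    dt: float,               # one tick interval (ms)
    a: float,                # optimizer slope: per-request processing cost (ms)
    latency_0: float,        # estimated latency of the oldest request (ms)
    max_latency_ms: float,   # hard latency budget
    optimizer_wait: float,   # wait budget learned by the optimizer (ms)
    max_batch_size: int,
    decay: float = 0.95,     # decay rate of the wait budget, as in the diff
) -> bool:
    return (
        # the batch still has room,
        n < max_batch_size
        # one more tick won't push the oldest request past 95% of the budget,
        and latency_0 + dt <= max_latency_ms * 0.95
        # and the optimizer still expects waiting to lower average latency.
        and n * (wn + dt + a) <= optimizer_wait * decay
    )


if __name__ == "__main__":
    # 3 queued, newest waited 2 ms, 1 ms tick, 5 ms/request cost: keep waiting
    print(should_keep_waiting(3, 2.0, 1.0, 5.0, 40.0, 100.0, 60.0, 8))  # True
    # oldest request is near the latency budget: dispatch now
    print(should_keep_waiting(3, 2.0, 1.0, 5.0, 95.0, 100.0, 60.0, 8))  # False
```

Turning the `if ... continue` into a `while` lets the controller refresh `n`, `wn`, and `latency_0` each tick instead of falling back through the outer loop, and the new `latency_0 + dt` clause dispatches early rather than waiting until the first request must be cancelled.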