From 333ef4e270928d55ff23b3e5f5af6df1b1d9841a Mon Sep 17 00:00:00 2001
From: Halldor Fannar <hfannar@nvidia.com>
Date: Mon, 19 Jan 2026 21:31:27 +0000
Subject: [PATCH 1/5] Add benchmark script to establish baseline

---
 src/tests/bench_grab_windows.py | 83 +++++++++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)
 create mode 100644 src/tests/bench_grab_windows.py

diff --git a/src/tests/bench_grab_windows.py b/src/tests/bench_grab_windows.py
new file mode 100644
index 0000000..bc8f619
--- /dev/null
+++ b/src/tests/bench_grab_windows.py
@@ -0,0 +1,83 @@
+"""Quick benchmark for Windows grab performance.
+
+This script is designed to measure the performance of the Windows grab
+implementation, particularly useful for comparing GetDIBits vs CreateDIBSection
+approaches.
+
+Run with: python -m tests.bench_grab_windows
+"""
+
+from __future__ import annotations
+
+import sys
+from time import perf_counter
+
+import mss
+
+ITERATIONS = 500
+WARMUP_ITERATIONS = 10
+
+
+def benchmark_grab() -> None:
+    """Benchmark the grab operation on the primary monitor."""
+    with mss.mss() as sct:
+        monitor = sct.monitors[1]  # Primary monitor
+        width, height = monitor["width"], monitor["height"]
+
+        print(f"Platform: {sys.platform}")
+        print(f"Region: {width}x{height}")
+        print(f"Iterations: {ITERATIONS}")
+        print()
+
+        # Warmup - let any JIT/caching settle
+        for _ in range(WARMUP_ITERATIONS):
+            sct.grab(monitor)
+
+        # Benchmark
+        start = perf_counter()
+        for _ in range(ITERATIONS):
+            sct.grab(monitor)
+        elapsed = perf_counter() - start
+
+        avg_ms = elapsed / ITERATIONS * 1000
+        fps = ITERATIONS / elapsed
+
+        print(f"Total time: {elapsed:.3f}s")
+        print(f"Avg per grab: {avg_ms:.2f}ms")
+        print(f"FPS: {fps:.1f}")
+
+
+def benchmark_grab_varying_sizes() -> None:
+    """Benchmark grab at different region sizes to see scaling behavior."""
+    sizes = [
+        (100, 100),
+        (640, 480),
+        (1280, 720),
+        (1920, 1080),
+    ]
+
+    print("\nVarying size benchmark:")
+    print("-" * 50)
+
+    with mss.mss() as sct:
+        for width, height in sizes:
+            monitor = {"top": 0, "left": 0, "width": width, "height": height}
+
+            # Warmup
+            for _ in range(WARMUP_ITERATIONS):
+                sct.grab(monitor)
+
+            # Benchmark
+            start = perf_counter()
+            for _ in range(ITERATIONS):
+                sct.grab(monitor)
+            elapsed = perf_counter() - start
+
+            avg_ms = elapsed / ITERATIONS * 1000
+            fps = ITERATIONS / elapsed
+            print(f"  {width}x{height}: {avg_ms:.2f}ms ({fps:.1f} FPS)")
+
+
+if __name__ == "__main__":
+    benchmark_grab()
+    benchmark_grab_varying_sizes()

From e2db3658f6df0dc198be5163ee2ab5bba407b203 Mon Sep 17 00:00:00 2001
From: Halldor Fannar <hfannar@nvidia.com>
Date: Tue, 20 Jan 2026 20:28:24 +0000
Subject: [PATCH 2/5] Switch over completely to CreateDIBSection

Streamline benchmark and adjust tests
---
 src/mss/windows.py              | 129 +++++++++++++++-----------
 src/tests/bench_grab_windows.py | 156 +++++++++++++++++++++++++++++++-
 src/tests/test_windows.py       |  35 +++----
 3 files changed, 237 insertions(+), 83 deletions(-)

diff --git a/src/mss/windows.py b/src/mss/windows.py
index df0477a..6726862 100644
--- a/src/mss/windows.py
+++ b/src/mss/windows.py
@@ -1,6 +1,7 @@
 """Windows GDI-based backend for MSS.
 
 Uses user32/gdi32 APIs to capture the desktop and enumerate monitors.
+This implementation uses CreateDIBSection for direct memory access to pixel data.
 """
 
 from __future__ import annotations
@@ -12,6 +13,7 @@
     BOOL,
     BYTE,
     DWORD,
+    HANDLE,
     HBITMAP,
     HDC,
     HGDIOBJ,
@@ -88,7 +90,7 @@ def _errcheck(result: BOOL | _Pointer, func: Callable, arguments: tuple) -> tupl
             "error_msg": winerror.strerror,
         }
         if winerror.winerror == 0:
-            # Some functions return NULL/0 on failure without setting last error.  (Example: CreateCompatibleBitmap
+            # Some functions return NULL/0 on failure without setting last error.  (Example: CreateDIBSection
             # with an invalid HDC.)
             msg = f"Windows graphics function failed (no error provided): {func.__name__}"
             raise ScreenShotError(msg, details=details)
@@ -105,12 +107,16 @@ def _errcheck(result: BOOL | _Pointer, func: Callable, arguments: tuple) -> tupl
 CFUNCTIONS: CFunctionsErrChecked = {
     # Syntax: cfunction: (attr, argtypes, restype, errcheck)
     "BitBlt": ("gdi32", [HDC, INT, INT, INT, INT, HDC, INT, INT, DWORD], BOOL, _errcheck),
-    "CreateCompatibleBitmap": ("gdi32", [HDC, INT, INT], HBITMAP, _errcheck),
     "CreateCompatibleDC": ("gdi32", [HDC], HDC, _errcheck),
+    # CreateDIBSection: ppvBits (4th param) receives a pointer to the DIB pixel data.
+    # hSection is NULL and offset is 0 to have the system allocate the memory.
+    "CreateDIBSection": ("gdi32", [HDC, POINTER(BITMAPINFO), UINT, POINTER(LPVOID), HANDLE, DWORD], HBITMAP, _errcheck),
     "DeleteDC": ("gdi32", [HDC], HDC, _errcheck),
     "DeleteObject": ("gdi32", [HGDIOBJ], BOOL, _errcheck),
     "EnumDisplayMonitors": ("user32", [HDC, LPCRECT, MONITORNUMPROC, LPARAM], BOOL, _errcheck),
-    "GetDIBits": ("gdi32", [HDC, HBITMAP, UINT, UINT, LPVOID, POINTER(BITMAPINFO), UINT], INT, _errcheck),
+    # GdiFlush flushes the calling thread's current batch of GDI operations.
+    # This ensures DIB memory is fully updated before reading.
+    "GdiFlush": ("gdi32", [], BOOL, None),
     # While GetSystemMetrics will return 0 if the parameter is invalid, it will also sometimes return 0 if the
     # parameter is valid but the value is actually 0 (e.g., SM_CLEANBOOT on a normal boot).  Thus, we do not attach an
     # errcheck function here.
@@ -126,6 +132,10 @@ def _errcheck(result: BOOL | _Pointer, func: Callable, arguments: tuple) -> tupl
 class MSS(MSSBase):
     """Multiple ScreenShots implementation for Microsoft Windows.
 
+    This implementation uses CreateDIBSection for direct memory access to pixel data,
+    which eliminates the need for GetDIBits. The DIB pixel data is written directly
+    to system-managed memory that we can read from.
+
     This has no Windows-specific constructor parameters.
 
     .. seealso::
@@ -134,7 +144,17 @@ class MSS(MSSBase):
             Lists constructor parameters.
     """
 
-    __slots__ = {"_bmi", "_bmp", "_data", "_memdc", "_region_width_height", "_srcdc", "gdi32", "user32"}
+    __slots__ = {
+        "_bmi",
+        "_dib",
+        "_dib_array",
+        "_dib_bits",
+        "_memdc",
+        "_region_width_height",
+        "_srcdc",
+        "gdi32",
+        "user32",
+    }
 
     def __init__(self, /, **kwargs: Any) -> None:
         super().__init__(**kwargs)
@@ -147,29 +167,30 @@ def __init__(self, /, **kwargs: Any) -> None:
 
         # Available instance-specific variables
         self._region_width_height: tuple[int, int] | None = None
-        self._bmp: HBITMAP | None = None
+        self._dib: HBITMAP | None = None
+        self._dib_bits: LPVOID = LPVOID()  # Pointer to DIB pixel data
+        self._dib_array: ctypes.Array[ctypes.c_char] | None = None  # Cached array view of DIB memory
         self._srcdc = self.user32.GetWindowDC(0)
         self._memdc = self.gdi32.CreateCompatibleDC(self._srcdc)
-        self._data: ctypes.Array[ctypes.c_char] | None = None
 
         bmi = BITMAPINFO()
         bmi.bmiHeader.biSize = ctypes.sizeof(BITMAPINFOHEADER)
         # biWidth and biHeight are set in _grab_impl().
         bmi.bmiHeader.biPlanes = 1  # Always 1
-        bmi.bmiHeader.biBitCount = 32  # See grab.__doc__ [2]
+        bmi.bmiHeader.biBitCount = 32  # 32 bits per pixel; BI_RGB stores each pixel as B, G, R, X bytes
         bmi.bmiHeader.biCompression = 0  # 0 = BI_RGB (no compression)
         bmi.bmiHeader.biSizeImage = 0  # Windows infers the size
         bmi.bmiHeader.biXPelsPerMeter = 0  # Unspecified
         bmi.bmiHeader.biYPelsPerMeter = 0  # Unspecified
-        bmi.bmiHeader.biClrUsed = 0  # See grab.__doc__ [3]
-        bmi.bmiHeader.biClrImportant = 0  # See grab.__doc__ [3]
+        bmi.bmiHeader.biClrUsed = 0
+        bmi.bmiHeader.biClrImportant = 0
         self._bmi = bmi
 
     def _close_impl(self) -> None:
         # Clean-up
-        if self._bmp:
-            self.gdi32.DeleteObject(self._bmp)
-            self._bmp = None
+        if self._dib:
+            self.gdi32.DeleteObject(self._dib)
+            self._dib = None
 
         if self._memdc:
             self.gdi32.DeleteDC(self._memdc)
@@ -239,34 +260,17 @@ def callback(_monitor: HMONITOR, _data: HDC, rect: LPRECT, _dc: LPARAM) -> bool:
         user32.EnumDisplayMonitors(0, None, callback, 0)
 
     def _grab_impl(self, monitor: Monitor, /) -> ScreenShot:
-        """Retrieve all pixels from a monitor. Pixels have to be RGB.
-
-        In the code, there are a few interesting things:
-
-        [1] bmi.bmiHeader.biHeight = -height
-
-        A bottom-up DIB is specified by setting the height to a
-        positive number, while a top-down DIB is specified by
-        setting the height to a negative number.
-        https://msdn.microsoft.com/en-us/library/ms787796.aspx
-        https://msdn.microsoft.com/en-us/library/dd144879%28v=vs.85%29.aspx
-
-
-        [2] bmi.bmiHeader.biBitCount = 32
-            image_data = create_string_buffer(height * width * 4)
+        """Retrieve all pixels from a monitor using CreateDIBSection.
 
-        We grab the image in RGBX mode, so that each word is 32bit
-        and we have no striding.
-        Inspired by https://github.com/zoofIO/flexx
+        CreateDIBSection creates a DIB with system-managed memory backing,
+        allowing BitBlt to write directly to memory we can read. This eliminates
+        the need for a separate GetDIBits call.
 
-
-        [3] bmi.bmiHeader.biClrUsed = 0
-            bmi.bmiHeader.biClrImportant = 0
-
-        When biClrUsed and biClrImportant are set to zero, there
-        is "no" color table, so we can read the pixels of the bitmap
-        retrieved by gdi32.GetDIBits() as a sequence of RGB values.
-        Thanks to http://stackoverflow.com/a/3688682
+        Note on biHeight: A bottom-up DIB is specified by setting the height to a
+        positive number, while a top-down DIB is specified by setting the height
+        to a negative number. We use negative height for top-down orientation.
+        https://learn.microsoft.com/en-us/windows/win32/api/wingdi/ns-wingdi-bitmapinfoheader
+        https://learn.microsoft.com/en-us/windows/win32/api/wingdi/nf-wingdi-createdibsection
         """
         srcdc, memdc = self._srcdc, self._memdc
         gdi = self.gdi32
@@ -275,25 +279,44 @@ def _grab_impl(self, monitor: Monitor, /) -> ScreenShot:
-        if self._region_width_height != (width, height):
+        # Rebuild the DIB when the size changes, or when an earlier rebuild failed part-way (no array view).
+        if self._dib_array is None or self._region_width_height != (width, height):
             self._region_width_height = (width, height)
             self._bmi.bmiHeader.biWidth = width
-            self._bmi.bmiHeader.biHeight = -height  # Why minus? See [1]
-            self._data = ctypes.create_string_buffer(width * height * 4)  # [2]
-            if self._bmp:
-                gdi.DeleteObject(self._bmp)
-                # Set to None to prevent another DeleteObject in case CreateCompatibleBitmap raises an exception.
-                self._bmp = None
-            self._bmp = gdi.CreateCompatibleBitmap(srcdc, width, height)
-            gdi.SelectObject(memdc, self._bmp)
+            self._bmi.bmiHeader.biHeight = -height  # Negative for top-down DIB
+
+            # Drop the cached view before touching the DIB: if any call below raises, the next grab must
+            # rebuild everything instead of reading through a view of released (or wrongly sized) memory.
+            self._dib_array = None
+            if self._dib:
+                gdi.DeleteObject(self._dib)
+                self._dib = None
+
+            # CreateDIBSection creates the DIB and returns a pointer to the pixel data
+            self._dib_bits = LPVOID()
+            self._dib = gdi.CreateDIBSection(
+                memdc,
+                self._bmi,
+                DIB_RGB_COLORS,
+                ctypes.byref(self._dib_bits),
+                None,  # hSection = NULL (system allocates memory)
+                0,  # offset = 0
+            )
+            gdi.SelectObject(memdc, self._dib)
 
+            # Create a ctypes array type that maps directly to the DIB memory.
+            # This avoids the overhead of ctypes.string_at() creating an intermediate bytes object.
+            size = width * height * 4
+            array_type = ctypes.c_char * size
+            self._dib_array = ctypes.cast(self._dib_bits, POINTER(array_type)).contents
+
+        # BitBlt copies screen content directly into the DIB's memory
         gdi.BitBlt(memdc, 0, 0, width, height, srcdc, monitor["left"], monitor["top"], SRCCOPY | CAPTUREBLT)
-        assert self._data is not None  # noqa: S101 for type checker
-        scanlines_copied = gdi.GetDIBits(memdc, self._bmp, 0, height, self._data, self._bmi, DIB_RGB_COLORS)
-        if scanlines_copied != height:
-            # If the result was 0 (failure), an exception would have been raised by _errcheck.  This is just a sanity
-            # clause.
-            msg = f"gdi32.GetDIBits() failed: only {scanlines_copied} scanlines copied instead of {height}"
-            raise ScreenShotError(msg)
-
-        return self.cls_image(bytearray(self._data), monitor)
+
+        # Flush GDI operations to ensure DIB memory is fully updated before reading.
+        # This ensures the BitBlt has completed before we access the memory.
+        gdi.GdiFlush()
+
+        # Read directly from DIB memory via the cached array view
+        assert self._dib_array is not None  # noqa: S101  for type checker
+        return self.cls_image(bytearray(self._dib_array), monitor)
 
     def _cursor_impl(self) -> ScreenShot | None:
         """Retrieve all cursor data. Pixels have to be RGB."""
diff --git a/src/tests/bench_grab_windows.py b/src/tests/bench_grab_windows.py
index bc8f619..e6bf33e 100644
--- a/src/tests/bench_grab_windows.py
+++ b/src/tests/bench_grab_windows.py
@@ -1,10 +1,11 @@
 """Quick benchmark for Windows grab performance.
 
-This script is designed to measure the performance of the Windows grab
-implementation, particularly useful for comparing GetDIBits vs CreateDIBSection
-approaches.
+This script measures the performance of the Windows GDI grab implementation
+using CreateDIBSection.
 
 Run with: python -m tests.bench_grab_windows
+         python -m tests.bench_grab_windows timing
+         python -m tests.bench_grab_windows raw
 """
 
 from __future__ import annotations
@@ -18,8 +19,11 @@
 WARMUP_ITERATIONS = 10
 
 
-def benchmark_grab() -> None:
-    """Benchmark the grab operation on the primary monitor."""
+def benchmark_grab() -> tuple[float, float]:
+    """Benchmark the grab operation on the primary monitor.
+
+    Returns (avg_ms, fps) for comparison.
+    """
     with mss.mss() as sct:
         monitor = sct.monitors[1]  # Primary monitor
         width, height = monitor["width"], monitor["height"]
@@ -46,6 +50,8 @@ def benchmark_grab() -> None:
         print(f"Avg per grab: {avg_ms:.2f}ms")
         print(f"FPS: {fps:.1f}")
 
+        return avg_ms, fps
+
 
 def benchmark_grab_varying_sizes() -> None:
     """Benchmark grab at different region sizes to see scaling behavior."""
@@ -78,6 +84,146 @@ def benchmark_grab_varying_sizes() -> None:
             print(f"  {width}x{height}: {avg_ms:.2f}ms ({fps:.1f} FPS)")
 
 
+def benchmark_raw_bitblt() -> None:
+    """Benchmark raw BitBlt to isolate GDI performance from Python overhead."""
+    if sys.platform != "win32":
+        print("Raw BitBlt benchmark is only available on Windows.")
+        return
+
+    import ctypes  # noqa: PLC0415
+    from ctypes.wintypes import BOOL, DWORD, HDC, INT  # noqa: PLC0415
+
+    import mss.windows  # noqa: PLC0415
+
+    gdi32 = ctypes.WinDLL("gdi32", use_last_error=True)
+
+    # Get function references (names match Windows API)
+    bitblt = gdi32.BitBlt
+    bitblt.argtypes = [HDC, INT, INT, INT, INT, HDC, INT, INT, DWORD]
+    bitblt.restype = BOOL
+
+    gdiflush = gdi32.GdiFlush
+    gdiflush.argtypes = []
+    gdiflush.restype = BOOL
+
+    srccopy = 0x00CC0020
+    captureblt = 0x40000000
+
+    with mss.mss() as sct:
+        assert isinstance(sct, mss.windows.MSS)
+        monitor = sct.monitors[1]
+        width, height = monitor["width"], monitor["height"]
+        left, top = monitor["left"], monitor["top"]
+
+        # Force region setup
+        sct.grab(monitor)
+
+        srcdc = sct._srcdc
+        memdc = sct._memdc
+
+        print(f"Raw BitBlt benchmark ({width}x{height})")
+        print("=" * 50)
+
+        # Test with CAPTUREBLT
+        start = perf_counter()
+        for _ in range(ITERATIONS):
+            bitblt(memdc, 0, 0, width, height, srcdc, left, top, srccopy | captureblt)
+            gdiflush()
+        elapsed = perf_counter() - start
+        print(f"With CAPTUREBLT:    {elapsed/ITERATIONS*1000:.2f}ms ({ITERATIONS/elapsed:.1f} FPS)")
+
+        # Test without CAPTUREBLT
+        start = perf_counter()
+        for _ in range(ITERATIONS):
+            bitblt(memdc, 0, 0, width, height, srcdc, left, top, srccopy)
+            gdiflush()
+        elapsed = perf_counter() - start
+        print(f"Without CAPTUREBLT: {elapsed/ITERATIONS*1000:.2f}ms ({ITERATIONS/elapsed:.1f} FPS)")
+
+
+def analyze_frame_timing() -> None:
+    """Analyze individual frame timing to detect VSync/DWM patterns."""
+    num_samples = 200
+
+    with mss.mss() as sct:
+        monitor = sct.monitors[1]
+        width, height = monitor["width"], monitor["height"]
+
+        print("Frame timing analysis")
+        print(f"Region: {width}x{height}")
+        print(f"Samples: {num_samples}")
+        print("=" * 50)
+
+        # Warmup
+        for _ in range(WARMUP_ITERATIONS):
+            sct.grab(monitor)
+
+        # Collect individual frame times
+        times: list[float] = []
+        prev = perf_counter()
+        for _ in range(num_samples):
+            sct.grab(monitor)
+            now = perf_counter()
+            times.append((now - prev) * 1000)  # Convert to ms
+            prev = now
+
+        # Analyze the distribution
+        times.sort()
+        min_t = times[0]
+        max_t = times[-1]
+        avg_t = sum(times) / len(times)
+        median_t = times[len(times) // 2]
+
+        # Calculate percentiles
+        p5 = times[int(len(times) * 0.05)]
+        p95 = times[int(len(times) * 0.95)]
+
+        print("\nTiming distribution:")
+        print(f"  Min:    {min_t:.2f}ms")
+        print(f"  5th %:  {p5:.2f}ms")
+        print(f"  Median: {median_t:.2f}ms")
+        print(f"  Avg:    {avg_t:.2f}ms")
+        print(f"  95th %: {p95:.2f}ms")
+        print(f"  Max:    {max_t:.2f}ms")
+
+        # Check for VSync patterns
+        print("\nVSync pattern analysis:")
+        print("  60 Hz (16.67ms): ", end="")
+        near_60hz = sum(1 for t in times if 15 < t < 18)
+        print(f"{near_60hz}/{num_samples} samples ({near_60hz/num_samples*100:.0f}%)")
+
+        print("  30 Hz (33.33ms): ", end="")
+        near_30hz = sum(1 for t in times if 31 < t < 36)
+        print(f"{near_30hz}/{num_samples} samples ({near_30hz/num_samples*100:.0f}%)")
+
+        print("  < 10ms (fast):   ", end="")
+        fast = sum(1 for t in times if t < 10)
+        print(f"{fast}/{num_samples} samples ({fast/num_samples*100:.0f}%)")
+
+        # Histogram buckets
+        print("\nHistogram (ms):")
+        buckets = [0, 5, 10, 15, 20, 25, 30, 35, 40, 50, 100]
+        for i in range(len(buckets) - 1):
+            lo, hi = buckets[i], buckets[i + 1]
+            count = sum(1 for t in times if lo <= t < hi)
+            bar = "#" * (count * 40 // num_samples)
+            print(f"  {lo:3d}-{hi:3d}: {bar} ({count})")
+        # Overflow bucket
+        count = sum(1 for t in times if t >= buckets[-1])
+        if count > 0:
+            bar = "#" * (count * 40 // num_samples)
+            print(f"  {buckets[-1]:3d}+  : {bar} ({count})")
+
+
 if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        arg = sys.argv[1].lower()
+        if arg == "raw":
+            benchmark_raw_bitblt()
+            sys.exit(0)
+        if arg == "timing":
+            analyze_frame_timing()
+            sys.exit(0)
+
     benchmark_grab()
     benchmark_grab_varying_sizes()
diff --git a/src/tests/test_windows.py b/src/tests/test_windows.py
index fca8a5e..b07c0e0 100644
--- a/src/tests/test_windows.py
+++ b/src/tests/test_windows.py
@@ -9,7 +9,6 @@
 import pytest
 
 import mss
-from mss.exception import ScreenShotError
 
 try:
     import mss.windows
@@ -17,16 +16,6 @@
     pytestmark = pytest.mark.skip
 
 
-def test_implementation(monkeypatch: pytest.MonkeyPatch) -> None:
-    # Test bad data retrieval
-    with mss.mss() as sct:
-        assert isinstance(sct, mss.windows.MSS)  # For Mypy
-
-        monkeypatch.setattr(sct.gdi32, "GetDIBits", lambda *_: 0)
-        with pytest.raises(ScreenShotError):
-            sct.shot()
-
-
 def test_region_caching() -> None:
     """The region to grab is cached, ensure this is well-done."""
     with mss.mss() as sct:
@@ -35,18 +24,18 @@ def test_region_caching() -> None:
         # Grab the area 1
         region1 = {"top": 0, "left": 0, "width": 200, "height": 200}
         sct.grab(region1)
-        bmp1 = id(sct._bmp)
+        dib1 = id(sct._dib)
 
-        # Grab the area 2, the cached BMP is used
+        # Grab the area 2, the cached DIB is used
         # Same sizes but different positions
         region2 = {"top": 200, "left": 200, "width": 200, "height": 200}
         sct.grab(region2)
-        bmp2 = id(sct._bmp)
-        assert bmp1 == bmp2
+        dib2 = id(sct._dib)
+        assert dib1 == dib2
 
-        # Grab the area 2 again, the cached BMP is used
+        # Grab the area 2 again, the cached DIB is used
         sct.grab(region2)
-        assert bmp2 == id(sct._bmp)
+        assert dib2 == id(sct._dib)
 
 
 def test_region_not_caching() -> None:
@@ -60,15 +49,15 @@ def test_region_not_caching() -> None:
     region1 = {"top": 0, "left": 0, "width": 100, "height": 100}
     region2 = {"top": 0, "left": 0, "width": 50, "height": 1}
     grab1.grab(region1)
-    bmp1 = id(grab1._bmp)
+    dib1 = id(grab1._dib)
     grab2.grab(region2)
-    bmp2 = id(grab2._bmp)
-    assert bmp1 != bmp2
+    dib2 = id(grab2._dib)
+    assert dib1 != dib2
 
-    # Grab the area 1, is not bad cached BMP previous grab the area 2
+    # Grab area 1 again: grab1 must still use its own cached DIB, not the one grab2 created for area 2
     grab1.grab(region1)
-    bmp1 = id(grab1._bmp)
-    assert bmp1 != bmp2
+    dib1 = id(grab1._dib)
+    assert dib1 != dib2
 
 
 def run_child_thread(loops: int) -> None:

From 291ebbc8618104a330e747bd3f995b40b93a73ea Mon Sep 17 00:00:00 2001
From: Halldor Fannar <hfannar@nvidia.com>
Date: Wed, 21 Jan 2026 12:14:56 +0000
Subject: [PATCH 3/5] Formatting fixes to comply with ruff

---
 src/tests/bench_grab_windows.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/tests/bench_grab_windows.py b/src/tests/bench_grab_windows.py
index e6bf33e..844f27f 100644
--- a/src/tests/bench_grab_windows.py
+++ b/src/tests/bench_grab_windows.py
@@ -130,7 +130,7 @@ def benchmark_raw_bitblt() -> None:
             bitblt(memdc, 0, 0, width, height, srcdc, left, top, srccopy | captureblt)
             gdiflush()
         elapsed = perf_counter() - start
-        print(f"With CAPTUREBLT:    {elapsed/ITERATIONS*1000:.2f}ms ({ITERATIONS/elapsed:.1f} FPS)")
+        print(f"With CAPTUREBLT:    {elapsed / ITERATIONS * 1000:.2f}ms ({ITERATIONS / elapsed:.1f} FPS)")
 
         # Test without CAPTUREBLT
         start = perf_counter()
@@ -138,7 +138,7 @@ def benchmark_raw_bitblt() -> None:
             bitblt(memdc, 0, 0, width, height, srcdc, left, top, srccopy)
             gdiflush()
         elapsed = perf_counter() - start
-        print(f"Without CAPTUREBLT: {elapsed/ITERATIONS*1000:.2f}ms ({ITERATIONS/elapsed:.1f} FPS)")
+        print(f"Without CAPTUREBLT: {elapsed / ITERATIONS * 1000:.2f}ms ({ITERATIONS / elapsed:.1f} FPS)")
 
 
 def analyze_frame_timing() -> None:
@@ -190,15 +190,15 @@ def analyze_frame_timing() -> None:
         print("\nVSync pattern analysis:")
         print("  60 Hz (16.67ms): ", end="")
         near_60hz = sum(1 for t in times if 15 < t < 18)
-        print(f"{near_60hz}/{num_samples} samples ({near_60hz/num_samples*100:.0f}%)")
+        print(f"{near_60hz}/{num_samples} samples ({near_60hz / num_samples * 100:.0f}%)")
 
         print("  30 Hz (33.33ms): ", end="")
         near_30hz = sum(1 for t in times if 31 < t < 36)
-        print(f"{near_30hz}/{num_samples} samples ({near_30hz/num_samples*100:.0f}%)")
+        print(f"{near_30hz}/{num_samples} samples ({near_30hz / num_samples * 100:.0f}%)")
 
         print("  < 10ms (fast):   ", end="")
         fast = sum(1 for t in times if t < 10)
-        print(f"{fast}/{num_samples} samples ({fast/num_samples*100:.0f}%)")
+        print(f"{fast}/{num_samples} samples ({fast / num_samples * 100:.0f}%)")
 
         # Histogram buckets
         print("\nHistogram (ms):")

From 1da06c138bb544ed07b51ebdfa087031d22c37a4 Mon Sep 17 00:00:00 2001
From: Halldor Fannar <hfannar@nvidia.com>
Date: Wed, 21 Jan 2026 12:40:24 +0000
Subject: [PATCH 4/5] Update CHANGELOG.md

---
 CHANGELOG.md | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5087f45..eca17ff 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,8 +1,10 @@
 # History
 
-See Git checking messages for full history.
+See Git commit messages for full history.
 
-## 10.2.0.dev0 (2025-xx-xx)
+## 10.2.0.dev0 (2026-xx-xx)
+- Windows: switch from `GetDIBits` to the more memory-efficient `CreateDIBSection` for the `MSS.grab` implementation (#449)
+- Windows: fix `gdi32.GetDIBits() failed` errors after a couple of minutes of recording (#268)
 - Linux: check the server for Xrandr support version (#417)
 - Linux: improve typing and error messages for X libraries (#418)
 - Linux: introduce an XCB-powered backend stack with a factory in ``mss.linux`` while keeping the Xlib code as a fallback (#425)
@@ -10,7 +12,7 @@ See Git checking messages for full history.
 - Windows: improve error checking and messages for Win32 API calls (#448)
 - Mac: fix memory leak (#450, #453)
 - improve multithreading: allow multiple threads to use the same MSS object, allow multiple MSS objects to concurrently take screenshots, and document multithreading guarantees (#446, #452)
-- :heart: contributors: @jholveck
+- :heart: contributors: @jholveck, @halldorfannar
 
 ## 10.1.0 (2025-08-16)
 - Mac: up to 60% performances improvement by taking screenshots at nominal resolution (e.g. scaling is off by default). To enable back scaling, set `mss.darwin.IMAGE_OPTIONS = 0`. (#257)

From b7ef0c664ba6575c1c04956453657410195a2f38 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micka=C3=ABl=20Schoentgen?= <contact@tiger-222.fr>
Date: Wed, 21 Jan 2026 14:14:52 +0100
Subject: [PATCH 5/5] Add bench_grab_windows.py to test setup

---
 src/tests/test_setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/tests/test_setup.py b/src/tests/test_setup.py
index 295dba2..940a4f5 100644
--- a/src/tests/test_setup.py
+++ b/src/tests/test_setup.py
@@ -85,6 +85,7 @@ def test_sdist() -> None:
         f"mss-{__version__}/src/tests/__init__.py",
         f"mss-{__version__}/src/tests/bench_bgra2rgb.py",
         f"mss-{__version__}/src/tests/bench_general.py",
+        f"mss-{__version__}/src/tests/bench_grab_windows.py",
         f"mss-{__version__}/src/tests/conftest.py",
         f"mss-{__version__}/src/tests/res/monitor-1024x768.raw.zip",
         f"mss-{__version__}/src/tests/test_bgra_to_rgb.py",