perf: Eliminate per-pixel overheads in image rendering (#1293)

## Summary

Replace per-pixel getRenderMode() + rotateCoordinates() + bounds checks
with a DirectPixelWriter struct that pre-computes orientation and render
mode state once per row. Use bitwise ops instead of division/modulo for
cache pixel packing. Skip PNG cache allocation when buffer exceeds 48KB
(framebuffer size) since PNG decode is fast enough that caching provides
minimal benefit, and the large buffer competes with the 44KB PNG decoder
for heap.

## Additional Context
Measured improvements on ESP32-C3 @ 160MHz:
- JPEG decode: 5-7% faster (1:1 scale)
- PNG decode: 15-20% faster (1:1 scale)
- Cache renders: 3-6% faster across both formats
- Eliminates "Failed to allocate cache buffer" errors for large PNGs

---

### AI Usage

While CrossPoint doesn't have restrictions on AI tools in contributing,
please be transparent about their usage as it
helps set the right context for reviewers.

Did you use AI tools to help write this code? _**<  PARTIALLY >**_
This commit is contained in:
martin brook
2026-03-30 17:03:49 +01:00
committed by GitHub
parent 63961625a2
commit 1df543d48d
5 changed files with 215 additions and 32 deletions

View File

@@ -4,7 +4,7 @@
#include <Logging.h> #include <Logging.h>
#include <Serialization.h> #include <Serialization.h>
#include "../converters/DitherUtils.h" #include "../converters/DirectPixelWriter.h"
#include "../converters/ImageDecoderFactory.h" #include "../converters/ImageDecoderFactory.h"
// Cache file format: // Cache file format:
@@ -66,6 +66,9 @@ bool renderFromCache(GfxRenderer& renderer, const std::string& cachePath, int x,
return false; return false;
} }
DirectPixelWriter pw;
pw.init(renderer);
for (int row = 0; row < cachedHeight; row++) { for (int row = 0; row < cachedHeight; row++) {
if (cacheFile.read(rowBuffer, bytesPerRow) != bytesPerRow) { if (cacheFile.read(rowBuffer, bytesPerRow) != bytesPerRow) {
LOG_ERR("IMG", "Cache read error at row %d", row); LOG_ERR("IMG", "Cache read error at row %d", row);
@@ -74,13 +77,14 @@ bool renderFromCache(GfxRenderer& renderer, const std::string& cachePath, int x,
return false; return false;
} }
int destY = y + row; const int destY = y + row;
pw.beginRow(destY);
for (int col = 0; col < cachedWidth; col++) { for (int col = 0; col < cachedWidth; col++) {
int byteIdx = col / 4; const int byteIdx = col >> 2; // col / 4
int bitShift = 6 - (col % 4) * 2; // MSB first within byte const int bitShift = 6 - (col & 3) * 2; // MSB first within byte
uint8_t pixelValue = (rowBuffer[byteIdx] >> bitShift) & 0x03; uint8_t pixelValue = (rowBuffer[byteIdx] >> bitShift) & 0x03;
drawPixelWithRenderMode(renderer, x + col, destY, pixelValue); pw.writePixel(x + col, pixelValue);
} }
} }

View File

@@ -0,0 +1,156 @@
#pragma once
#include <GfxRenderer.h>
#include <HalDisplay.h>
#include <stdint.h>
// Direct framebuffer writer that removes per-pixel renderer calls from the
// image rendering hot path.
//
// init() captures the framebuffer pointer and render mode and collapses the
// renderer orientation into linear coefficients; beginRow() folds in the
// Y-dependent terms. writePixel() is then left with a render-mode check, two
// multiply-adds for the physical coordinates and a single-bit
// read-modify-write: no renderer method calls and no bounds checks.
//
// Caller is responsible for ensuring (outX, outY) are within screen bounds.
// ImageBlock::render() already validates this before entering the pixel loop,
// and the JPEG/PNG callbacks pre-clamp destination ranges to screen bounds.
//
// Usage: init(renderer) once, beginRow(y) once per row, writePixel(x, v) per pixel.
struct DirectPixelWriter {
  // Every member has a defined initial value so a writer that was never
  // init()'d holds a null framebuffer pointer instead of stack garbage.
  uint8_t* fb = nullptr;
  GfxRenderer::RenderMode mode = GfxRenderer::BW;

  // Orientation is collapsed into a linear transform:
  //   phyX = phyXBase + x * phyXStepX + y * phyXStepY
  //   phyY = phyYBase + x * phyYStepX + y * phyYStepY
  int phyXBase = 0, phyYBase = 0;
  int phyXStepX = 0, phyYStepX = 0;  // per logical-X step
  int phyXStepY = 0, phyYStepY = 0;  // per logical-Y step

  // Row-precomputed: the Y-dependent portion of the physical coords
  int rowPhyXBase = 0, rowPhyYBase = 0;

  // Snapshot the renderer's framebuffer, render mode and orientation.
  // Must be called before beginRow()/writePixel(); call again if the
  // renderer's mode or orientation changes.
  void init(GfxRenderer& renderer) {
    fb = renderer.getFrameBuffer();
    mode = renderer.getRenderMode();

    switch (renderer.getOrientation()) {
      case GfxRenderer::Portrait:
        // phyX = y, phyY = (DISPLAY_HEIGHT-1) - x
        phyXBase = 0;
        phyYBase = HalDisplay::DISPLAY_HEIGHT - 1;
        phyXStepX = 0;
        phyYStepX = -1;
        phyXStepY = 1;
        phyYStepY = 0;
        break;
      case GfxRenderer::LandscapeClockwise:
        // phyX = (DISPLAY_WIDTH-1) - x, phyY = (DISPLAY_HEIGHT-1) - y
        phyXBase = HalDisplay::DISPLAY_WIDTH - 1;
        phyYBase = HalDisplay::DISPLAY_HEIGHT - 1;
        phyXStepX = -1;
        phyYStepX = 0;
        phyXStepY = 0;
        phyYStepY = -1;
        break;
      case GfxRenderer::PortraitInverted:
        // phyX = (DISPLAY_WIDTH-1) - y, phyY = x
        phyXBase = HalDisplay::DISPLAY_WIDTH - 1;
        phyYBase = 0;
        phyXStepX = 0;
        phyYStepX = 1;
        phyXStepY = -1;
        phyYStepY = 0;
        break;
      case GfxRenderer::LandscapeCounterClockwise:
      default:
        // Identity transform (phyX = x, phyY = y). Unknown orientations
        // deliberately share this case instead of duplicating it.
        phyXBase = 0;
        phyYBase = 0;
        phyXStepX = 1;
        phyYStepX = 0;
        phyXStepY = 0;
        phyYStepY = 1;
        break;
    }
  }

  // Call once per row before the column loop.
  // Pre-computes the Y-dependent portion so writePixel() only needs the X part.
  inline void beginRow(int logicalY) {
    rowPhyXBase = phyXBase + logicalY * phyXStepY;
    rowPhyYBase = phyYBase + logicalY * phyYStepY;
  }

  // Write a single 2-bit dithered pixel value to the framebuffer.
  // Must be called after init() and after beginRow() for the current row.
  // No bounds checking: caller guarantees coordinates are valid.
  inline void writePixel(int logicalX, uint8_t pixelValue) const {
    // Decide whether this value touches the framebuffer in the current mode,
    // and whether the target bit is cleared (state == true) or set.
    bool draw;
    bool state;
    switch (mode) {
      case GfxRenderer::BW:
        draw = (pixelValue < 3);
        state = true;
        break;
      case GfxRenderer::GRAYSCALE_MSB:
        draw = (pixelValue == 1 || pixelValue == 2);
        state = false;
        break;
      case GfxRenderer::GRAYSCALE_LSB:
        draw = (pixelValue == 1);
        state = false;
        break;
      default:
        return;
    }
    if (!draw) return;

    const int phyX = rowPhyXBase + logicalX * phyXStepX;
    const int phyY = rowPhyYBase + logicalX * phyYStepX;

    // Keep the byte offset at full width: a 16-bit index would silently wrap
    // for any framebuffer of 64 KiB or more.
    const uint32_t byteIndex = static_cast<uint32_t>(phyY * HalDisplay::DISPLAY_WIDTH_BYTES + (phyX >> 3));
    const uint8_t bitMask = static_cast<uint8_t>(1u << (7 - (phyX & 7)));  // MSB-first within the byte

    if (state) {
      fb[byteIndex] &= static_cast<uint8_t>(~bitMask);  // Clear bit (draw black)
    } else {
      fb[byteIndex] |= bitMask;  // Set bit (draw white)
    }
  }
};
// Fast-path writer for the packed 2-bit pixel cache, used in place of
// PixelCache::setPixel() inside the decode loops. The row start is resolved
// once per row, leaving only a byte lookup and a masked store per pixel.
//
// Packing: four pixels per byte, MSB first (pixel 0 occupies bits 7-6).
// Caller guarantees coordinates are within cache bounds.
struct DirectCacheWriter {
  uint8_t* buffer;
  int bytesPerRow;
  int originX;
  uint8_t* rowPtr;  // Start of the row chosen by the latest beginRow() call

  // Bind the writer to a cache buffer. No row is selected until beginRow().
  void init(uint8_t* cacheBuffer, int cacheBytesPerRow, int cacheOriginX) {
    rowPtr = nullptr;
    originX = cacheOriginX;
    bytesPerRow = cacheBytesPerRow;
    buffer = cacheBuffer;
  }

  // Select the cache row for screenY; call once per row before the column loop.
  inline void beginRow(int screenY, int cacheOriginY) {
    const int localY = screenY - cacheOriginY;
    rowPtr = buffer + localY * bytesPerRow;
  }

  // Store the low two bits of `value` at screenX in the current row.
  // No bounds checking.
  inline void writePixel(int screenX, uint8_t value) const {
    const int column = screenX - originX;
    const int shift = (3 - (column & 3)) * 2;  // 6, 4, 2, 0 for pixels 0..3 of a byte
    uint8_t& packed = rowPtr[column >> 2];     // four pixels per byte
    const int cleared = packed & ~(0x03 << shift);
    packed = cleared | ((value & 0x03) << shift);
  }
};

View File

@@ -1,6 +1,5 @@
#pragma once #pragma once
#include <GfxRenderer.h>
#include <stdint.h> #include <stdint.h>
// 4x4 Bayer matrix for ordered dithering // 4x4 Bayer matrix for ordered dithering
@@ -26,15 +25,3 @@ inline uint8_t applyBayerDither4Level(uint8_t gray, int x, int y) {
if (adjusted < 192) return 2; if (adjusted < 192) return 2;
return 3; return 3;
} }
// Plot one 2-bit pixel value through the renderer, filtered by the active
// render mode:
//   BW             values 0-2 -> drawPixel(x, y, true)
//   GRAYSCALE_MSB  values 1-2 -> drawPixel(x, y, false)
//   GRAYSCALE_LSB  value  1   -> drawPixel(x, y, false)
// Any other mode/value combination leaves the framebuffer untouched.
inline void drawPixelWithRenderMode(GfxRenderer& renderer, int x, int y, uint8_t pixelValue) {
  switch (renderer.getRenderMode()) {
    case GfxRenderer::BW:
      if (pixelValue < 3) renderer.drawPixel(x, y, true);
      return;
    case GfxRenderer::GRAYSCALE_MSB:
      if (pixelValue == 1 || pixelValue == 2) renderer.drawPixel(x, y, false);
      return;
    case GfxRenderer::GRAYSCALE_LSB:
      if (pixelValue == 1) renderer.drawPixel(x, y, false);
      return;
    default:
      return;
  }
}

View File

@@ -9,6 +9,7 @@
#include <cstdlib> #include <cstdlib>
#include <new> #include <new>
#include "DirectPixelWriter.h"
#include "DitherUtils.h" #include "DitherUtils.h"
#include "PixelCache.h" #include "PixelCache.h"
@@ -167,10 +168,21 @@ int jpegDrawCallback(JPEGDRAW* pDraw) {
if (dstYStart >= dstYEnd || dstXStart >= dstXEnd) return 1; if (dstYStart >= dstYEnd || dstXStart >= dstXEnd) return 1;
// Pre-compute orientation and render-mode state once per callback invocation
DirectPixelWriter pw;
pw.init(renderer);
DirectCacheWriter cw;
if (caching) {
cw.init(ctx->cache.buffer, ctx->cache.bytesPerRow, ctx->cache.originX);
}
// === 1:1 fast path: no scaling math === // === 1:1 fast path: no scaling math ===
if (fineScaleFP == FP_ONE) { if (fineScaleFP == FP_ONE) {
for (int dstY = dstYStart; dstY < dstYEnd; dstY++) { for (int dstY = dstYStart; dstY < dstYEnd; dstY++) {
const int outY = cfgY + dstY; const int outY = cfgY + dstY;
pw.beginRow(outY);
if (caching) cw.beginRow(outY, ctx->config->y);
const uint8_t* row = &pixels[(dstY - blockY) * stride]; const uint8_t* row = &pixels[(dstY - blockY) * stride];
for (int dstX = dstXStart; dstX < dstXEnd; dstX++) { for (int dstX = dstXStart; dstX < dstXEnd; dstX++) {
const int outX = cfgX + dstX; const int outX = cfgX + dstX;
@@ -182,8 +194,8 @@ int jpegDrawCallback(JPEGDRAW* pDraw) {
dithered = gray / 85; dithered = gray / 85;
if (dithered > 3) dithered = 3; if (dithered > 3) dithered = 3;
} }
drawPixelWithRenderMode(renderer, outX, outY, dithered); pw.writePixel(outX, dithered);
if (caching) ctx->cache.setPixel(outX, outY, dithered); if (caching) cw.writePixel(outX, dithered);
} }
} }
return 1; return 1;
@@ -203,6 +215,8 @@ int jpegDrawCallback(JPEGDRAW* pDraw) {
for (int dstY = dstYStart; dstY < dstYEnd; dstY++) { for (int dstY = dstYStart; dstY < dstYEnd; dstY++) {
const int outY = cfgY + dstY; const int outY = cfgY + dstY;
pw.beginRow(outY);
if (caching) cw.beginRow(outY, ctx->config->y);
const int32_t srcFyFP = dstY * invScaleFP; const int32_t srcFyFP = dstY * invScaleFP;
const int32_t fy = srcFyFP & FP_MASK; const int32_t fy = srcFyFP & FP_MASK;
const int32_t fyInv = FP_ONE - fy; const int32_t fyInv = FP_ONE - fy;
@@ -239,8 +253,8 @@ int jpegDrawCallback(JPEGDRAW* pDraw) {
dithered = gray / 85; dithered = gray / 85;
if (dithered > 3) dithered = 3; if (dithered > 3) dithered = 3;
} }
drawPixelWithRenderMode(renderer, outX, outY, dithered); pw.writePixel(outX, dithered);
if (caching) ctx->cache.setPixel(outX, outY, dithered); if (caching) cw.writePixel(outX, dithered);
} }
// Interior (no X boundary checks — lx0 and lx0+1 guaranteed in bounds) // Interior (no X boundary checks — lx0 and lx0+1 guaranteed in bounds)
@@ -262,8 +276,8 @@ int jpegDrawCallback(JPEGDRAW* pDraw) {
dithered = gray / 85; dithered = gray / 85;
if (dithered > 3) dithered = 3; if (dithered > 3) dithered = 3;
} }
drawPixelWithRenderMode(renderer, outX, outY, dithered); pw.writePixel(outX, dithered);
if (caching) ctx->cache.setPixel(outX, outY, dithered); if (caching) cw.writePixel(outX, dithered);
} }
// Right edge (with X boundary clamping) // Right edge (with X boundary clamping)
@@ -288,8 +302,8 @@ int jpegDrawCallback(JPEGDRAW* pDraw) {
dithered = gray / 85; dithered = gray / 85;
if (dithered > 3) dithered = 3; if (dithered > 3) dithered = 3;
} }
drawPixelWithRenderMode(renderer, outX, outY, dithered); pw.writePixel(outX, dithered);
if (caching) ctx->cache.setPixel(outX, outY, dithered); if (caching) cw.writePixel(outX, dithered);
} }
} }
return 1; return 1;
@@ -298,6 +312,8 @@ int jpegDrawCallback(JPEGDRAW* pDraw) {
// === Nearest-neighbor (downscale: fineScale < 1.0) === // === Nearest-neighbor (downscale: fineScale < 1.0) ===
for (int dstY = dstYStart; dstY < dstYEnd; dstY++) { for (int dstY = dstYStart; dstY < dstYEnd; dstY++) {
const int outY = cfgY + dstY; const int outY = cfgY + dstY;
pw.beginRow(outY);
if (caching) cw.beginRow(outY, ctx->config->y);
const int32_t srcFyFP = dstY * invScaleFP; const int32_t srcFyFP = dstY * invScaleFP;
int ly = (srcFyFP >> FP_SHIFT) - blockY; int ly = (srcFyFP >> FP_SHIFT) - blockY;
if (ly < 0) ly = 0; if (ly < 0) ly = 0;
@@ -319,8 +335,8 @@ int jpegDrawCallback(JPEGDRAW* pDraw) {
dithered = gray / 85; dithered = gray / 85;
if (dithered > 3) dithered = 3; if (dithered > 3) dithered = 3;
} }
drawPixelWithRenderMode(renderer, outX, outY, dithered); pw.writePixel(outX, dithered);
if (caching) ctx->cache.setPixel(outX, outY, dithered); if (caching) cw.writePixel(outX, dithered);
} }
} }

View File

@@ -9,6 +9,7 @@
#include <cstdlib> #include <cstdlib>
#include <new> #include <new>
#include "DirectPixelWriter.h"
#include "DitherUtils.h" #include "DitherUtils.h"
#include "PixelCache.h" #include "PixelCache.h"
@@ -207,6 +208,17 @@ int pngDrawCallback(PNGDRAW* pDraw) {
bool useDithering = ctx->config->useDithering; bool useDithering = ctx->config->useDithering;
bool caching = ctx->caching; bool caching = ctx->caching;
// Pre-compute orientation and render-mode state once per row
DirectPixelWriter pw;
pw.init(*ctx->renderer);
pw.beginRow(outY);
DirectCacheWriter cw;
if (caching) {
cw.init(ctx->cache.buffer, ctx->cache.bytesPerRow, ctx->cache.originX);
cw.beginRow(outY, ctx->config->y);
}
int srcX = 0; int srcX = 0;
int error = 0; int error = 0;
@@ -222,8 +234,8 @@ int pngDrawCallback(PNGDRAW* pDraw) {
ditheredGray = gray / 85; ditheredGray = gray / 85;
if (ditheredGray > 3) ditheredGray = 3; if (ditheredGray > 3) ditheredGray = 3;
} }
drawPixelWithRenderMode(*ctx->renderer, outX, outY, ditheredGray); pw.writePixel(outX, ditheredGray);
if (caching) ctx->cache.setPixel(outX, outY, ditheredGray); if (caching) cw.writePixel(outX, ditheredGray);
} }
// Bresenham-style stepping: advance srcX based on ratio srcWidth/dstWidth // Bresenham-style stepping: advance srcX based on ratio srcWidth/dstWidth
@@ -356,10 +368,18 @@ bool PngToFramebufferConverter::decodeToFramebuffer(const std::string& imagePath
return false; return false;
} }
// Allocate cache buffer using SCALED dimensions // Allocate cache buffer using SCALED dimensions.
// PNG decode is fast enough (~135ms for 400x600) that caching provides minimal benefit
// for larger images, while the cache buffer competes with the 44KB PNG decoder for heap.
// Skip caching when the buffer would exceed the framebuffer size (48KB).
static constexpr size_t PNG_MAX_CACHE_BYTES = 48000;
ctx.caching = !config.cachePath.empty(); ctx.caching = !config.cachePath.empty();
if (ctx.caching) { if (ctx.caching) {
if (!ctx.cache.allocate(ctx.dstWidth, ctx.dstHeight, config.x, config.y)) { size_t cacheSize = (size_t)((ctx.dstWidth + 3) / 4) * ctx.dstHeight;
if (cacheSize > PNG_MAX_CACHE_BYTES) {
LOG_DBG("PNG", "Skipping cache: %zu bytes exceeds PNG limit (%zu)", cacheSize, PNG_MAX_CACHE_BYTES);
ctx.caching = false;
} else if (!ctx.cache.allocate(ctx.dstWidth, ctx.dstHeight, config.x, config.y)) {
LOG_ERR("PNG", "Failed to allocate cache buffer, continuing without caching"); LOG_ERR("PNG", "Failed to allocate cache buffer, continuing without caching");
ctx.caching = false; ctx.caching = false;
} }