Improve EPUB cover image quality with pre-scaling and Atkinson dithering

Pre-scaling (critical fix):
- Add pre-scaling to fit display dimensions (480x800) before dithering to prevent post-downsampling artifacts that destroy dithering patterns
- Use fixed-point (16.16) math for sub-pixel accurate scaling
- Implement area averaging for smooth downsampling

Dithering improvements:
- Add Atkinson dithering (75% error diffusion) for cleaner results
- Add Floyd-Steinberg dithering option with serpentine scanning
- Keep clustered-dot halftone and Bayer as compile-time options

Image adjustments:
- Add brightness, contrast, and gamma adjustments for better visibility
- Adjust RGB to grayscale conversion (25-50-25) to reduce blue darkness

Other changes:
- Fix MCU block indexing bug for correct picojpeg buffer access
- Pre-generate cover BMP when EPUB is loaded for faster sleep screen
2025-12-25 00:30:28 +09:00 · 2025-12-25 00:30:28 +09:00 · 16568932cf
commit 16568932cf
parent c801da2d9a
6 changed files with 716 additions and 83 deletions
--- a/lib/GfxRenderer/Bitmap.cpp
+++ b/lib/GfxRenderer/Bitmap.cpp
@ -3,6 +3,125 @@
 #include <cstdlib>
 #include <cstring>

+// ============================================================================
+// IMAGE PROCESSING OPTIONS - Toggle these to test different configurations
+// ============================================================================
+// Note: For cover images, dithering is done in JpegToBmpConverter.cpp
+// This file handles BMP reading - use simple quantization to avoid double-dithering
+//
+// All of these are compile-time switches; changing one requires a rebuild.
+// When USE_FLOYD_STEINBERG is true, parseHeaders() allocates the error rows and
+// readRow() uses Floyd-Steinberg instead of quantize(), so it takes precedence
+// over USE_NOISE_DITHERING.
+constexpr bool USE_FLOYD_STEINBERG = false;// Disabled - dithering done at JPEG conversion
+constexpr bool USE_NOISE_DITHERING = false;// Hash-based noise dithering
+// Brightness adjustments:
+constexpr bool USE_BRIGHTNESS = false;     // true: apply brightness/gamma adjustments
+constexpr int BRIGHTNESS_BOOST = 20;       // Brightness offset (0-50), only if USE_BRIGHTNESS=true
+constexpr bool GAMMA_CORRECTION = false;   // Gamma curve, only if USE_BRIGHTNESS=true
+// ============================================================================
+
+// Integer gamma correction (gamma 0.5) that brightens midtones.
+// Returns floor(sqrt(gray * 255)), the integer form of 255 * sqrt(gray / 255).
+// Inputs <= 0 are returned unchanged and inputs >= 255 saturate at 255.
+// Returns gray untouched when GAMMA_CORRECTION is disabled.
+//
+// The square root is iterated to convergence. A fixed two Newton steps seeded
+// with `gray` badly overshoots for dark tones (gray = 1 came out as 64 instead
+// of 15), which lifted every non-black shadow to mid-gray.
+static inline int applyGamma(int gray) {
+  if (!GAMMA_CORRECTION) return gray;
+  if (gray <= 0) return gray;   // nothing to brighten; also keeps the division below safe
+  if (gray >= 255) return 255;  // sqrt(255 * 255) == 255; larger inputs saturate
+  const int product = gray * 255;
+  // Newton-Raphson from an upper bound (255 >= sqrt(product)): the estimate
+  // decreases strictly until it reaches floor(sqrt(product)), then stops.
+  int x = 255;
+  for (;;) {
+    const int next = (x + product / x) >> 1;
+    if (next >= x) break;
+    x = next;
+  }
+  return x;
+}
+
+// Plain 2-bit reduction with no dithering: the top two bits of the (optionally
+// brightened) 8-bit value become the output level 0-3.
+static inline uint8_t quantizeSimple(int gray) {
+  int level = gray;
+  if (USE_BRIGHTNESS) {
+    const int boosted = level + BRIGHTNESS_BOOST;
+    // Cap at white before the gamma curve is applied
+    level = applyGamma(boosted > 255 ? 255 : boosted);
+  }
+  return static_cast<uint8_t>(level >> 6);
+}
+
+// Hash-based noise dithering - survives downsampling without moiré artifacts
+// gray: 8-bit luminance; x, y: pixel position used to seed the per-pixel hash.
+// Returns a 2-bit level (0-3).
+static inline uint8_t quantizeNoise(int gray, int x, int y) {
+  if (USE_BRIGHTNESS) {
+    gray += BRIGHTNESS_BOOST;
+    if (gray > 255) gray = 255;
+    gray = applyGamma(gray);
+  }
+
+  uint32_t hash = static_cast<uint32_t>(x) * 374761393u + static_cast<uint32_t>(y) * 668265263u;
+  hash = (hash ^ (hash >> 13)) * 1274126177u;
+  // Threshold must stay in 0-254: with 255 allowed, pure black (scaled == 0)
+  // would satisfy "scaled + threshold >= 255" and speckle to level 1.
+  const int threshold = static_cast<int>(((hash >> 24) * 255u) >> 8);
+
+  // Scale to 0-765 so each of the three level gaps spans exactly 255 steps
+  const int scaled = gray * 3;
+  if (scaled < 255) {
+    return (scaled + threshold >= 255) ? 1 : 0;
+  } else if (scaled < 510) {
+    return ((scaled - 255) + threshold >= 255) ? 2 : 1;
+  } else {
+    return ((scaled - 510) + threshold >= 255) ? 3 : 2;
+  }
+}
+
+// Quantization entry point: picks noise dithering or plain reduction at compile time
+static inline uint8_t quantize(int gray, int x, int y) {
+  return USE_NOISE_DITHERING ? quantizeNoise(gray, x, y) : quantizeSimple(gray);
+}
+
+// Floyd-Steinberg quantization with error diffusion and serpentine scanning
+// Returns 2-bit value (0-3) and updates error buffers
+//
+// gray:         8-bit luminance of the pixel
+// x:            pixel column; its slot in both error rows is index x + 1
+// width:        currently unused (kept for interface compatibility)
+// errorCurRow:  error row for the current scanline, at least x + 3 entries
+// errorNextRow: error row for the following scanline, at least x + 3 entries
+// reverseDir:   true when the row is scanned right-to-left (mirrors the kernel)
+//
+// The shares use truncating division and the last neighbour receives the
+// remainder, so exactly `error` is diffused. Right-shifting a negative error
+// floors each share instead (error = -1 used to diffuse -4), which biased the
+// image darker.
+static inline uint8_t quantizeFloydSteinberg(int gray, int x, int width, int16_t* errorCurRow, int16_t* errorNextRow, bool reverseDir) {
+  (void)width;
+
+  // Add accumulated error to this pixel
+  int adjusted = gray + errorCurRow[x + 1];
+
+  // Clamp to valid range
+  if (adjusted < 0) adjusted = 0;
+  if (adjusted > 255) adjusted = 255;
+
+  // Quantize to 4 levels (0, 85, 170, 255)
+  uint8_t quantized;
+  int quantizedValue;
+  if (adjusted < 43) {
+    quantized = 0;
+    quantizedValue = 0;
+  } else if (adjusted < 128) {
+    quantized = 1;
+    quantizedValue = 85;
+  } else if (adjusted < 213) {
+    quantized = 2;
+    quantizedValue = 170;
+  } else {
+    quantized = 3;
+    quantizedValue = 255;
+  }
+
+  // Split the error 7/16, 3/16, 5/16 and give the rest (nominally 1/16) to the last neighbour
+  const int error = adjusted - quantizedValue;
+  const int ahead = (error * 7) / 16;
+  const int belowBehind = (error * 3) / 16;
+  const int below = (error * 5) / 16;
+  const int belowAhead = error - ahead - belowBehind - below;
+
+  // "ahead" follows the scan direction: +1 for left-to-right, -1 for right-to-left
+  const int center = x + 1;
+  const int step = reverseDir ? -1 : 1;
+  errorCurRow[center + step] += ahead;
+  errorNextRow[center - step] += belowBehind;
+  errorNextRow[center] += below;
+  errorNextRow[center + step] += belowAhead;
+
+  return quantized;
+}
+
+// Releases the Floyd-Steinberg error rows. Both pointers default to nullptr and
+// are only allocated by parseHeaders() when USE_FLOYD_STEINBERG is enabled;
+// delete[] on a null pointer is a no-op.
+Bitmap::~Bitmap() {
+  delete[] errorCurRow;
+  delete[] errorNextRow;
+}
+
 uint16_t Bitmap::readLE16(File& f) {
  const int c0 = f.read();
  const int c1 = f.read();
@ -46,6 +165,8 @@ const char* Bitmap::errorToString(BmpReaderError err) {
      return "UnsupportedCompression (expected BI_RGB or BI_BITFIELDS for 32bpp)";
    case BmpReaderError::BadDimensions:
      return "BadDimensions";
+    case BmpReaderError::ImageTooLarge:
+      return "ImageTooLarge (max 2048x3072)";
    case BmpReaderError::PaletteTooLarge:
      return "PaletteTooLarge";

@ -99,6 +220,13 @@ BmpReaderError Bitmap::parseHeaders() {

  if (width <= 0 || height <= 0) return BmpReaderError::BadDimensions;

+  // Safety limits to prevent memory issues on ESP32
+  constexpr int MAX_IMAGE_WIDTH = 2048;
+  constexpr int MAX_IMAGE_HEIGHT = 3072;
+  if (width > MAX_IMAGE_WIDTH || height > MAX_IMAGE_HEIGHT) {
+    return BmpReaderError::ImageTooLarge;
+  }
+
  // Pre-calculate Row Bytes to avoid doing this every row
  rowBytes = (width * bpp + 31) / 32 * 4;

@ -115,21 +243,56 @@ BmpReaderError Bitmap::parseHeaders() {
    return BmpReaderError::SeekPixelDataFailed;
  }

+  // Allocate Floyd-Steinberg error buffers if enabled
+  if (USE_FLOYD_STEINBERG) {
+    delete[] errorCurRow;
+    delete[] errorNextRow;
+    errorCurRow = new int16_t[width + 2]();   // +2 for boundary handling
+    errorNextRow = new int16_t[width + 2]();
+    lastRowY = -1;
+  }
+
  return BmpReaderError::Ok;
 }

 // packed 2bpp output, 0 = black, 1 = dark gray, 2 = light gray, 3 = white
-BmpReaderError Bitmap::readRow(uint8_t* data, uint8_t* rowBuffer) const {
+BmpReaderError Bitmap::readRow(uint8_t* data, uint8_t* rowBuffer, int rowY) const {
  // Note: rowBuffer should be pre-allocated by the caller to size 'rowBytes'
  if (file.read(rowBuffer, rowBytes) != rowBytes) return BmpReaderError::ShortReadRow;

+  // Handle Floyd-Steinberg error buffer progression
+  const bool useFS = USE_FLOYD_STEINBERG && errorCurRow && errorNextRow;
+  if (useFS) {
+    // Check if we need to advance to next row (or reset if jumping)
+    if (rowY != lastRowY + 1 && rowY != 0) {
+      // Non-sequential row access - reset error buffers
+      memset(errorCurRow, 0, (width + 2) * sizeof(int16_t));
+      memset(errorNextRow, 0, (width + 2) * sizeof(int16_t));
+    } else if (rowY > 0) {
+      // Sequential access - swap buffers
+      int16_t* temp = errorCurRow;
+      errorCurRow = errorNextRow;
+      errorNextRow = temp;
+      memset(errorNextRow, 0, (width + 2) * sizeof(int16_t));
+    }
+    lastRowY = rowY;
+  }
+
  uint8_t* outPtr = data;
  uint8_t currentOutByte = 0;
  int bitShift = 6;
+  int currentX = 0;

  // Helper lambda to pack 2bpp color into the output stream
  auto packPixel = [&](const uint8_t lum) {
-    uint8_t color = (lum >> 6);  // Simple 2-bit reduction: 0-255 -> 0-3
+    uint8_t color;
+    if (useFS) {
+      // Floyd-Steinberg error diffusion
+      color = quantizeFloydSteinberg(lum, currentX, width, errorCurRow, errorNextRow, false);
+    } else {
+      // Simple quantization or noise dithering
+      color = quantize(lum, currentX, rowY);
+    }
    currentOutByte |= (color << bitShift);
    if (bitShift == 0) {
      *outPtr++ = currentOutByte;
@ -138,6 +301,7 @@ BmpReaderError Bitmap::readRow(uint8_t* data, uint8_t* rowBuffer) const {
    } else {
      bitShift -= 2;
    }
+    currentX++;
  };

  uint8_t lum;
@ -196,5 +360,12 @@ BmpReaderError Bitmap::rewindToData() const {
    return BmpReaderError::SeekPixelDataFailed;
  }

+  // Reset Floyd-Steinberg error buffers when rewinding
+  if (USE_FLOYD_STEINBERG && errorCurRow && errorNextRow) {
+    memset(errorCurRow, 0, (width + 2) * sizeof(int16_t));
+    memset(errorNextRow, 0, (width + 2) * sizeof(int16_t));
+    lastRowY = -1;
+  }
+
  return BmpReaderError::Ok;
 }
--- a/lib/GfxRenderer/Bitmap.h
+++ b/lib/GfxRenderer/Bitmap.h
@ -15,6 +15,7 @@ enum class BmpReaderError : uint8_t {
  UnsupportedCompression,

  BadDimensions,
+  ImageTooLarge,
  PaletteTooLarge,

  SeekPixelDataFailed,
@ -28,8 +29,9 @@ class Bitmap {
  static const char* errorToString(BmpReaderError err);

  explicit Bitmap(File& file) : file(file) {}
+  ~Bitmap();
  BmpReaderError parseHeaders();
-  BmpReaderError readRow(uint8_t* data, uint8_t* rowBuffer) const;
+  BmpReaderError readRow(uint8_t* data, uint8_t* rowBuffer, int rowY) const;
  BmpReaderError rewindToData() const;
  int getWidth() const { return width; }
  int getHeight() const { return height; }
@ -49,4 +51,9 @@ class Bitmap {
  uint16_t bpp = 0;
  int rowBytes = 0;
  uint8_t paletteLum[256] = {};
+
+  // Floyd-Steinberg dithering state (mutable for const methods)
+  mutable int16_t* errorCurRow = nullptr;
+  mutable int16_t* errorNextRow = nullptr;
+  mutable int lastRowY = -1;  // Track row progression for error propagation
 };
--- a/lib/GfxRenderer/GfxRenderer.cpp
+++ b/lib/GfxRenderer/GfxRenderer.cpp
@ -132,7 +132,9 @@ void GfxRenderer::drawBitmap(const Bitmap& bitmap, const int x, const int y, con
    isScaled = true;
  }

-  const uint8_t outputRowSize = (bitmap.getWidth() + 3) / 4;
+  // Calculate output row size (2 bits per pixel, packed into bytes)
+  // IMPORTANT: Use int, not uint8_t, to avoid overflow for images > 1020 pixels wide
+  const int outputRowSize = (bitmap.getWidth() + 3) / 4;
  auto* outputRow = static_cast<uint8_t*>(malloc(outputRowSize));
  auto* rowBytes = static_cast<uint8_t*>(malloc(bitmap.getRowBytes()));

@ -154,7 +156,7 @@ void GfxRenderer::drawBitmap(const Bitmap& bitmap, const int x, const int y, con
      break;
    }

-    if (bitmap.readRow(outputRow, rowBytes) != BmpReaderError::Ok) {
+    if (bitmap.readRow(outputRow, rowBytes, bmpY) != BmpReaderError::Ok) {
      Serial.printf("[%lu] [GFX] Failed to read row %d from bitmap\n", millis(), bmpY);
      free(outputRow);
      free(rowBytes);
--- a/lib/JpegToBmpConverter/JpegToBmpConverter.cpp
+++ b/lib/JpegToBmpConverter/JpegToBmpConverter.cpp
@ -13,50 +13,296 @@ struct JpegReadContext {
  size_t bufferFilled;
 };

-// 4x4 Bayer ordered dithering matrix (normalized to 0-255 range for 16 levels)
-// This creates a pattern that distributes quantization error spatially
-// Reference: https://surma.dev/things/ditherpunk/
-static const uint8_t bayerMatrix4x4[4][4] = {
-    {0, 128, 32, 160},    //  0/16,  8/16,  2/16, 10/16
-    {192, 64, 224, 96},   // 12/16,  4/16, 14/16,  6/16
-    {48, 176, 16, 144},   //  3/16, 11/16,  1/16,  9/16
-    {240, 112, 208, 80}   // 15/16,  7/16, 13/16,  5/16
-};
+// ============================================================================
+// IMAGE PROCESSING OPTIONS - Toggle these to test different configurations
+// ============================================================================
+// All of these are compile-time switches; changing one requires a rebuild.
+constexpr bool USE_8BIT_OUTPUT = false;    // true: 8-bit grayscale (no quantization), false: 2-bit (4 levels)
+// Dithering method selection (only one should be true, or all false for simple quantization):
+// If both USE_ATKINSON and USE_FLOYD_STEINBERG are set, the converter creates
+// only the Atkinson ditherer (it is checked first).
+constexpr bool USE_ATKINSON = true;        // Atkinson dithering (cleaner than F-S, less error diffusion)
+constexpr bool USE_FLOYD_STEINBERG = false;// Floyd-Steinberg error diffusion (can cause "worm" artifacts)
+constexpr bool USE_NOISE_DITHERING = false;// Hash-based noise dithering (good for downsampling)
+// Brightness/Contrast adjustments:
+constexpr bool USE_BRIGHTNESS = true;      // true: apply brightness/gamma adjustments
+constexpr int BRIGHTNESS_BOOST = 10;       // Brightness offset (0-50)
+constexpr bool GAMMA_CORRECTION = true;    // Gamma curve (brightens midtones)
+constexpr float CONTRAST_FACTOR = 1.15f;   // Contrast multiplier (1.0 = no change, >1 = more contrast)
+// Pre-resize to target display size (CRITICAL: avoids dithering artifacts from post-downsampling)
+constexpr bool USE_PRESCALE = true;        // true: scale image to target size before dithering
+constexpr int TARGET_MAX_WIDTH = 480;      // Max width for cover images (portrait display width)
+constexpr int TARGET_MAX_HEIGHT = 800;     // Max height for cover images (portrait display height)
+// ============================================================================

-// Helper function: Convert 8-bit grayscale to 2-bit (0-3) using ordered dithering
-uint8_t JpegToBmpConverter::grayscaleTo2Bit(const uint8_t grayscale, const int x, const int y) {
-  // Get the threshold from Bayer matrix based on pixel position
-  const uint8_t threshold = bayerMatrix4x4[y & 3][x & 3];
+// Integer gamma correction (gamma ~0.5) that brightens dark/mid tones while
+// preserving highlights: out = 255 * sqrt(in/255) = sqrt(in * 255).
+// Returns floor(sqrt(gray * 255)); inputs <= 0 are returned unchanged and
+// inputs >= 255 saturate at 255. Returns gray untouched when GAMMA_CORRECTION
+// is disabled.
+//
+// The square root is iterated to convergence. A fixed two Newton steps seeded
+// with `gray` badly overshoots for dark tones (gray = 1 came out as 64 instead
+// of 15), which lifted every non-black shadow to mid-gray.
+static inline int applyGamma(int gray) {
+  if (!GAMMA_CORRECTION) return gray;
+  if (gray <= 0) return gray;   // nothing to brighten; also keeps the division below safe
+  if (gray >= 255) return 255;  // sqrt(255 * 255) == 255; larger inputs saturate
+  const int product = gray * 255;
+  // Newton-Raphson from an upper bound (255 >= sqrt(product)): the estimate
+  // decreases strictly until it reaches floor(sqrt(product)), then stops.
+  int x = 255;
+  for (;;) {
+    const int next = (x + product / x) >> 1;
+    if (next >= x) break;
+    x = next;
+  }
+  return x;
+}

-  // For 4-level output (2-bit), we need to map grayscale to one of 4 levels
-  // Each level spans ~85 values (255/3 ≈ 85)
-  // We use the Bayer threshold to decide between adjacent levels
+// Stretches (factor > 1.0) or compresses (factor < 1.0) values around the
+// midpoint 128 and clamps the result to 0-255.
+static inline int applyContrast(int gray) {
+  // CONTRAST_FACTOR expressed as an integer percentage (1.15 -> 115) so the
+  // per-pixel math stays in integers
+  constexpr int percent = static_cast<int>(CONTRAST_FACTOR * 100);
+  const int stretched = 128 + ((gray - 128) * percent) / 100;
+  if (stretched <= 0) return 0;
+  return stretched >= 255 ? 255 : stretched;
+}

-  // Scale grayscale to 0-765 range (3 * 255) for finer comparison
-  const int scaled = grayscale * 3;
+// Full tone adjustment for one pixel: contrast, then brightness offset (clamped
+// to 0-255), then gamma. Returns gray unchanged when USE_BRIGHTNESS is off.
+static inline int adjustPixel(int gray) {
+  if (USE_BRIGHTNESS) {
+    int shifted = applyContrast(gray) + BRIGHTNESS_BOOST;
+    if (shifted > 255) {
+      shifted = 255;
+    } else if (shifted < 0) {
+      shifted = 0;
+    }
+    gray = applyGamma(shifted);
+  }
+  return gray;
+}
+
+// Plain 2-bit reduction with no dithering: after tone adjustment the top two
+// bits select the level (0-63 -> 0, 64-127 -> 1, 128-191 -> 2, 192-255 -> 3).
+static inline uint8_t quantizeSimple(int gray) {
+  const int adjusted = adjustPixel(gray);
+  return static_cast<uint8_t>(adjusted >> 6);
+}
+
+// Hash-based noise dithering - survives downsampling without moiré artifacts
+// Uses integer hash to generate pseudo-random threshold per pixel
+// gray: 8-bit luminance; x, y: pixel position used to seed the hash.
+// Returns a 2-bit level (0-3).
+static inline uint8_t quantizeNoise(int gray, int x, int y) {
+  gray = adjustPixel(gray);
+
+  // Generate noise threshold using integer hash (no regular pattern to alias)
+  uint32_t hash = static_cast<uint32_t>(x) * 374761393u + static_cast<uint32_t>(y) * 668265263u;
+  hash = (hash ^ (hash >> 13)) * 1274126177u;
+  // Threshold must stay in 0-254: with 255 allowed, pure black (scaled == 0)
+  // would satisfy "scaled + threshold >= 255" and speckle to level 1.
+  const int threshold = static_cast<int>(((hash >> 24) * 255u) >> 8);  // 0-254
+
+  // Map gray (0-255) to 4 levels with dithering
+  const int scaled = gray * 3;

-  // Determine which level pair we're between, then use dithering to pick one
  if (scaled < 255) {
-    // Between level 0 (black) and level 1 (dark gray)
-    // Use threshold to decide: if scaled value + dither > 255, go to level 1
    return (scaled + threshold >= 255) ? 1 : 0;
  } else if (scaled < 510) {
-    // Between level 1 (dark gray) and level 2 (light gray)
    return ((scaled - 255) + threshold >= 255) ? 2 : 1;
  } else {
-    // Between level 2 (light gray) and level 3 (white)
    return ((scaled - 510) + threshold >= 255) ? 3 : 2;
  }
 }

+// Quantization entry point: picks noise dithering or plain reduction at compile time
+static inline uint8_t quantize(int gray, int x, int y) {
+  return USE_NOISE_DITHERING ? quantizeNoise(gray, x, y) : quantizeSimple(gray);
+}
+
+// Atkinson dithering - distributes only 6/8 (75%) of error for cleaner results
+// Error distribution pattern:
+//     X  1/8 1/8
+// 1/8 1/8 1/8
+//     1/8
+// Less error buildup = fewer artifacts than Floyd-Steinberg
+//
+// Owns three error rows of width + 4 entries each. Pixel x lives at index
+// x + 2, leaving two padding slots on each side so the kernel never needs a
+// bounds check. Call processPixel() for x = 0..width-1, then nextRow() at the
+// end of every scanline.
+//
+// Non-copyable: a copy would delete the same rows a second time.
+class AtkinsonDitherer {
+ public:
+  explicit AtkinsonDitherer(int width) : width(width) {
+    errorRow0 = new int16_t[width + 4]();  // Current row
+    errorRow1 = new int16_t[width + 4]();  // Next row
+    errorRow2 = new int16_t[width + 4]();  // Row after next
+  }
+
+  ~AtkinsonDitherer() {
+    delete[] errorRow0;
+    delete[] errorRow1;
+    delete[] errorRow2;
+  }
+
+  AtkinsonDitherer(const AtkinsonDitherer&) = delete;
+  AtkinsonDitherer& operator=(const AtkinsonDitherer&) = delete;
+
+  // Applies tone adjustment and accumulated error to one pixel, quantizes it
+  // to a 2-bit level (0-3) and diffuses the quantization error.
+  uint8_t processPixel(int gray, int x) {
+    // Apply brightness/contrast/gamma adjustments
+    gray = adjustPixel(gray);
+
+    // Add accumulated error
+    int adjusted = gray + errorRow0[x + 2];
+    if (adjusted < 0) adjusted = 0;
+    if (adjusted > 255) adjusted = 255;
+
+    // Quantize to 4 levels
+    uint8_t quantized;
+    int quantizedValue;
+    if (adjusted < 43) {
+      quantized = 0;
+      quantizedValue = 0;
+    } else if (adjusted < 128) {
+      quantized = 1;
+      quantizedValue = 85;
+    } else if (adjusted < 213) {
+      quantized = 2;
+      quantizedValue = 170;
+    } else {
+      quantized = 3;
+      quantizedValue = 255;
+    }
+
+    // One eighth of the error per neighbour (6/8 = 75% diffused in total).
+    // Division truncates toward zero so positive and negative errors are
+    // treated symmetrically; ">> 3" floors, turning errors of -1..-7 into -1
+    // per neighbour while +1..+7 became 0, which darkened the image.
+    const int error = (adjusted - quantizedValue) / 8;
+
+    // Distribute 1/8 to each of 6 neighbors
+    errorRow0[x + 3] += error;      // Right
+    errorRow0[x + 4] += error;      // Right+1
+    errorRow1[x + 1] += error;      // Bottom-left
+    errorRow1[x + 2] += error;      // Bottom
+    errorRow1[x + 3] += error;      // Bottom-right
+    errorRow2[x + 2] += error;      // Two rows down
+
+    return quantized;
+  }
+
+  // Advances one scanline: rotates the three rows and clears the new last one
+  void nextRow() {
+    int16_t* temp = errorRow0;
+    errorRow0 = errorRow1;
+    errorRow1 = errorRow2;
+    errorRow2 = temp;
+    memset(errorRow2, 0, (width + 4) * sizeof(int16_t));
+  }
+
+  // Clears all accumulated error
+  void reset() {
+    memset(errorRow0, 0, (width + 4) * sizeof(int16_t));
+    memset(errorRow1, 0, (width + 4) * sizeof(int16_t));
+    memset(errorRow2, 0, (width + 4) * sizeof(int16_t));
+  }
+
+ private:
+  int width;
+  int16_t* errorRow0;
+  int16_t* errorRow1;
+  int16_t* errorRow2;
+};
+
+// Floyd-Steinberg error diffusion dithering with serpentine scanning
+// Serpentine scanning alternates direction each row to reduce "worm" artifacts
+// Error distribution pattern (left-to-right):
+//       X   7/16
+// 3/16 5/16 1/16
+// Error distribution pattern (right-to-left, mirrored):
+// 1/16 5/16 3/16
+//      7/16  X
+//
+// Owns two error rows of width + 2 entries each; pixel x lives at index x + 1.
+// Non-copyable: a copy would delete the same rows a second time.
+//
+// NOTE(review): unlike AtkinsonDitherer::processPixel, this class does not call
+// adjustPixel() itself - presumably the caller applies it first; confirm.
+class FloydSteinbergDitherer {
+ public:
+  explicit FloydSteinbergDitherer(int width) : width(width), rowCount(0) {
+    errorCurRow = new int16_t[width + 2]();   // +2 for boundary handling
+    errorNextRow = new int16_t[width + 2]();
+  }
+
+  ~FloydSteinbergDitherer() {
+    delete[] errorCurRow;
+    delete[] errorNextRow;
+  }
+
+  FloydSteinbergDitherer(const FloydSteinbergDitherer&) = delete;
+  FloydSteinbergDitherer& operator=(const FloydSteinbergDitherer&) = delete;
+
+  // Process a single pixel and return quantized 2-bit value
+  // x is the logical x position (0 to width-1); reverseDirection mirrors the
+  // kernel for right-to-left rows
+  uint8_t processPixel(int gray, int x, bool reverseDirection) {
+    // Add accumulated error to this pixel
+    int adjusted = gray + errorCurRow[x + 1];
+
+    // Clamp to valid range
+    if (adjusted < 0) adjusted = 0;
+    if (adjusted > 255) adjusted = 255;
+
+    // Quantize to 4 levels (0, 85, 170, 255)
+    uint8_t quantized;
+    int quantizedValue;
+    if (adjusted < 43) {
+      quantized = 0;
+      quantizedValue = 0;
+    } else if (adjusted < 128) {
+      quantized = 1;
+      quantizedValue = 85;
+    } else if (adjusted < 213) {
+      quantized = 2;
+      quantizedValue = 170;
+    } else {
+      quantized = 3;
+      quantizedValue = 255;
+    }
+
+    // Split the error 7/16, 3/16, 5/16 and hand the rest (nominally 1/16) to
+    // the last neighbour so exactly `error` is diffused. Truncating division
+    // avoids the darkening bias of ">> 4", which floors negative shares
+    // (error = -1 used to diffuse -4).
+    const int error = adjusted - quantizedValue;
+    const int ahead = (error * 7) / 16;
+    const int belowBehind = (error * 3) / 16;
+    const int below = (error * 5) / 16;
+    const int belowAhead = error - ahead - belowBehind - below;
+
+    // "ahead" follows the scan direction: +1 left-to-right, -1 right-to-left
+    const int center = x + 1;
+    const int step = reverseDirection ? -1 : 1;
+    errorCurRow[center + step] += ahead;
+    errorNextRow[center - step] += belowBehind;
+    errorNextRow[center] += below;
+    errorNextRow[center + step] += belowAhead;
+
+    return quantized;
+  }
+
+  // Call at the end of each row to swap buffers
+  void nextRow() {
+    // Swap buffers
+    int16_t* temp = errorCurRow;
+    errorCurRow = errorNextRow;
+    errorNextRow = temp;
+    // Clear the next row buffer
+    memset(errorNextRow, 0, (width + 2) * sizeof(int16_t));
+    rowCount++;
+  }
+
+  // Check if current row should be processed in reverse (odd rows)
+  bool isReverseRow() const { return (rowCount & 1) != 0; }
+
+  // Reset for a new image or MCU block
+  void reset() {
+    memset(errorCurRow, 0, (width + 2) * sizeof(int16_t));
+    memset(errorNextRow, 0, (width + 2) * sizeof(int16_t));
+    rowCount = 0;
+  }
+
+ private:
+  int width;
+  int rowCount;
+  int16_t* errorCurRow;
+  int16_t* errorNextRow;
+};
+
+// Writes a 16-bit value to `out` one byte at a time, least-significant byte first.
 inline void write16(Print& out, const uint16_t value) {
-  // out.write(reinterpret_cast<const uint8_t *>(&value), 2);
  out.write(value & 0xFF);
  out.write((value >> 8) & 0xFF);
 }

 inline void write32(Print& out, const uint32_t value) {
-  // out.write(reinterpret_cast<const uint8_t *>(&value), 4);
  out.write(value & 0xFF);
  out.write((value >> 8) & 0xFF);
  out.write((value >> 16) & 0xFF);
@ -64,13 +310,49 @@ inline void write32(Print& out, const uint32_t value) {
 }

+// Writes a signed 32-bit value to `out` one byte at a time, least-significant byte first.
 inline void write32Signed(Print& out, const int32_t value) {
-  // out.write(reinterpret_cast<const uint8_t *>(&value), 4);
  out.write(value & 0xFF);
  out.write((value >> 8) & 0xFF);
  out.write((value >> 16) & 0xFF);
  out.write((value >> 24) & 0xFF);
 }

+// Helper function: Write BMP header with 8-bit grayscale (256 levels)
+// Emits the 14-byte file header, the 40-byte BITMAPINFOHEADER and a 256-entry
+// gray palette to bmpOut. Pixel rows are not written here; the header declares
+// them as top-down (negative height) and padded to a multiple of 4 bytes.
+// bmpOut: destination stream; width/height: image size in pixels.
+// NOTE(review): this is a free function with external linkage, whereas the
+// 2-bit writeBmpHeader is a JpegToBmpConverter member - confirm that is intended.
+void writeBmpHeader8bit(Print& bmpOut, const int width, const int height) {
+  // Calculate row padding (each row must be multiple of 4 bytes)
+  const int bytesPerRow = (width + 3) / 4 * 4;  // 8 bits per pixel, padded
+  const int imageSize = bytesPerRow * height;
+  const uint32_t paletteSize = 256 * 4;  // 256 colors * 4 bytes (BGRA)
+  const uint32_t fileSize = 14 + 40 + paletteSize + imageSize;
+
+  // BMP File Header (14 bytes)
+  bmpOut.write('B');
+  bmpOut.write('M');
+  write32(bmpOut, fileSize);
+  write32(bmpOut, 0);                        // Reserved
+  write32(bmpOut, 14 + 40 + paletteSize);    // Offset to pixel data
+
+  // DIB Header (BITMAPINFOHEADER - 40 bytes)
+  write32(bmpOut, 40);
+  write32Signed(bmpOut, width);
+  write32Signed(bmpOut, -height);  // Negative height = top-down bitmap
+  write16(bmpOut, 1);              // Color planes
+  write16(bmpOut, 8);              // Bits per pixel (8 bits)
+  write32(bmpOut, 0);              // BI_RGB (no compression)
+  write32(bmpOut, imageSize);
+  write32(bmpOut, 2835);           // xPixelsPerMeter (72 DPI)
+  write32(bmpOut, 2835);           // yPixelsPerMeter (72 DPI)
+  write32(bmpOut, 256);            // colorsUsed
+  write32(bmpOut, 256);            // colorsImportant
+
+  // Color Palette (256 grayscale entries x 4 bytes = 1024 bytes)
+  // Entry i is gray level i, so a pixel byte is its own luminance
+  for (int i = 0; i < 256; i++) {
+    bmpOut.write(static_cast<uint8_t>(i));  // Blue
+    bmpOut.write(static_cast<uint8_t>(i));  // Green
+    bmpOut.write(static_cast<uint8_t>(i));  // Red
+    bmpOut.write(static_cast<uint8_t>(0));  // Reserved
+  }
+}
+
 // Helper function: Write BMP header with 2-bit color depth
 void JpegToBmpConverter::writeBmpHeader(Print& bmpOut, const int width, const int height) {
  // Calculate row padding (each row must be multiple of 4 bytes)
@ -161,13 +443,60 @@ bool JpegToBmpConverter::jpegFileToBmpStream(File& jpegFile, Print& bmpOut) {
  Serial.printf("[%lu] [JPG] JPEG dimensions: %dx%d, components: %d, MCUs: %dx%d\n", millis(), imageInfo.m_width,
                imageInfo.m_height, imageInfo.m_comps, imageInfo.m_MCUSPerRow, imageInfo.m_MCUSPerCol);

-  // Write BMP header
-  writeBmpHeader(bmpOut, imageInfo.m_width, imageInfo.m_height);
+  // Safety limits to prevent memory issues on ESP32
+  constexpr int MAX_IMAGE_WIDTH = 2048;
+  constexpr int MAX_IMAGE_HEIGHT = 3072;
+  constexpr int MAX_MCU_ROW_BYTES = 65536;

-  // Calculate row parameters
-  const int bytesPerRow = (imageInfo.m_width * 2 + 31) / 32 * 4;
+  if (imageInfo.m_width > MAX_IMAGE_WIDTH || imageInfo.m_height > MAX_IMAGE_HEIGHT) {
+    Serial.printf("[%lu] [JPG] Image too large (%dx%d), max supported: %dx%d\n", millis(), imageInfo.m_width,
+                  imageInfo.m_height, MAX_IMAGE_WIDTH, MAX_IMAGE_HEIGHT);
+    return false;
+  }

-  // Allocate row buffer for packed 2-bit pixels
+  // Calculate output dimensions (pre-scale to fit display exactly)
+  int outWidth = imageInfo.m_width;
+  int outHeight = imageInfo.m_height;
+  // Use fixed-point scaling (16.16) for sub-pixel accuracy
+  uint32_t scaleX_fp = 65536;  // 1.0 in 16.16 fixed point
+  uint32_t scaleY_fp = 65536;
+  bool needsScaling = false;
+
+  if (USE_PRESCALE && (imageInfo.m_width > TARGET_MAX_WIDTH || imageInfo.m_height > TARGET_MAX_HEIGHT)) {
+    // Calculate scale to fit within target dimensions while maintaining aspect ratio
+    const float scaleToFitWidth = static_cast<float>(TARGET_MAX_WIDTH) / imageInfo.m_width;
+    const float scaleToFitHeight = static_cast<float>(TARGET_MAX_HEIGHT) / imageInfo.m_height;
+    const float scale = (scaleToFitWidth < scaleToFitHeight) ? scaleToFitWidth : scaleToFitHeight;
+
+    outWidth = static_cast<int>(imageInfo.m_width * scale);
+    outHeight = static_cast<int>(imageInfo.m_height * scale);
+
+    // Ensure at least 1 pixel
+    if (outWidth < 1) outWidth = 1;
+    if (outHeight < 1) outHeight = 1;
+
+    // Calculate fixed-point scale factors (source pixels per output pixel)
+    // scaleX_fp = (srcWidth << 16) / outWidth
+    scaleX_fp = (static_cast<uint32_t>(imageInfo.m_width) << 16) / outWidth;
+    scaleY_fp = (static_cast<uint32_t>(imageInfo.m_height) << 16) / outHeight;
+    needsScaling = true;
+
+    Serial.printf("[%lu] [JPG] Pre-scaling %dx%d -> %dx%d (fit to %dx%d)\n", millis(),
+                  imageInfo.m_width, imageInfo.m_height, outWidth, outHeight,
+                  TARGET_MAX_WIDTH, TARGET_MAX_HEIGHT);
+  }
+
+  // Write BMP header with output dimensions
+  int bytesPerRow;
+  if (USE_8BIT_OUTPUT) {
+    writeBmpHeader8bit(bmpOut, outWidth, outHeight);
+    bytesPerRow = (outWidth + 3) / 4 * 4;
+  } else {
+    writeBmpHeader(bmpOut, outWidth, outHeight);
+    bytesPerRow = (outWidth * 2 + 31) / 32 * 4;
+  }
+
+  // Allocate row buffer
  auto* rowBuffer = static_cast<uint8_t*>(malloc(bytesPerRow));
  if (!rowBuffer) {
    Serial.printf("[%lu] [JPG] Failed to allocate row buffer\n", millis());
@ -178,13 +507,48 @@ bool JpegToBmpConverter::jpegFileToBmpStream(File& jpegFile, Print& bmpOut) {
  // This is the minimal memory needed for streaming conversion
  const int mcuPixelHeight = imageInfo.m_MCUHeight;
  const int mcuRowPixels = imageInfo.m_width * mcuPixelHeight;
-  auto* mcuRowBuffer = static_cast<uint8_t*>(malloc(mcuRowPixels));
-  if (!mcuRowBuffer) {
-    Serial.printf("[%lu] [JPG] Failed to allocate MCU row buffer\n", millis());
+
+  // Validate MCU row buffer size before allocation
+  if (mcuRowPixels > MAX_MCU_ROW_BYTES) {
+    Serial.printf("[%lu] [JPG] MCU row buffer too large (%d bytes), max: %d\n", millis(), mcuRowPixels,
+                  MAX_MCU_ROW_BYTES);
    free(rowBuffer);
    return false;
  }

+  auto* mcuRowBuffer = static_cast<uint8_t*>(malloc(mcuRowPixels));
+  if (!mcuRowBuffer) {
+    Serial.printf("[%lu] [JPG] Failed to allocate MCU row buffer (%d bytes)\n", millis(), mcuRowPixels);
+    free(rowBuffer);
+    return false;
+  }
+
+  // Create ditherer if enabled (only for 2-bit output)
+  // Use OUTPUT dimensions for dithering (after prescaling)
+  AtkinsonDitherer* atkinsonDitherer = nullptr;
+  FloydSteinbergDitherer* fsDitherer = nullptr;
+  if (!USE_8BIT_OUTPUT) {
+    if (USE_ATKINSON) {
+      atkinsonDitherer = new AtkinsonDitherer(outWidth);
+    } else if (USE_FLOYD_STEINBERG) {
+      fsDitherer = new FloydSteinbergDitherer(outWidth);
+    }
+  }
+
+  // For scaling: accumulate source rows into scaled output rows
+  // We need to track which source Y maps to which output Y
+  // Using fixed-point: srcY_fp = outY * scaleY_fp (gives source Y in 16.16 format)
+  uint32_t* rowAccum = nullptr;      // Accumulator for each output X (32-bit for larger sums)
+  uint16_t* rowCount = nullptr;      // Count of source pixels accumulated per output X
+  int currentOutY = 0;               // Current output row being accumulated
+  uint32_t nextOutY_srcStart = 0;    // Source Y where next output row starts (16.16 fixed point)
+
+  if (needsScaling) {
+    rowAccum = new uint32_t[outWidth]();
+    rowCount = new uint16_t[outWidth]();
+    nextOutY_srcStart = scaleY_fp;   // First boundary is at scaleY_fp (source Y for outY=1)
+  }
+
  // Process MCUs row-by-row and write to BMP as we go (top-down)
  const int mcuPixelWidth = imageInfo.m_MCUWidth;

@ -207,75 +571,160 @@ bool JpegToBmpConverter::jpegFileToBmpStream(File& jpegFile, Print& bmpOut) {
        return false;
      }

-      // Process MCU block into MCU row buffer
-      // MCUs are composed of 8x8 blocks. For 16x16 MCUs, there are four 8x8 blocks:
-      // Block layout for 16x16 MCU:  [0, 64]  (top row of blocks)
-      //                              [128, 192] (bottom row of blocks)
+      // picojpeg stores MCU data in 8x8 blocks
+      // Block layout: H2V2(16x16)=0,64,128,192 H2V1(16x8)=0,64 H1V2(8x16)=0,128
      for (int blockY = 0; blockY < mcuPixelHeight; blockY++) {
        for (int blockX = 0; blockX < mcuPixelWidth; blockX++) {
          const int pixelX = mcuX * mcuPixelWidth + blockX;
+          if (pixelX >= imageInfo.m_width) continue;

-          // Skip pixels outside image width (can happen with MCU alignment)
-          if (pixelX >= imageInfo.m_width) {
-            continue;
-          }
+          // Block offset into picojpeg buffer. NOTE(review): for H1V2 (8x16) blocksPerRow is 1, giving 64, not the 128 listed above - confirm
+          const int blockCol = blockX / 8;
+          const int blockRow = blockY / 8;
+          const int localX = blockX % 8;
+          const int localY = blockY % 8;
+          const int blocksPerRow = mcuPixelWidth / 8;
+          const int blockIndex = blockRow * blocksPerRow + blockCol;
+          const int pixelOffset = blockIndex * 64 + localY * 8 + localX;

-          // Calculate which 8x8 block and position within that block
-          const int block8x8Col = blockX / 8;  // 0 or 1 for 16-wide MCU
-          const int block8x8Row = blockY / 8;  // 0 or 1 for 16-tall MCU
-          const int pixelInBlockX = blockX % 8;
-          const int pixelInBlockY = blockY % 8;
-
-          // Calculate byte offset: each 8x8 block is 64 bytes
-          // Blocks are arranged: [0, 64], [128, 192]
-          const int blockOffset = (block8x8Row * (mcuPixelWidth / 8) + block8x8Col) * 64;
-          const int mcuIndex = blockOffset + pixelInBlockY * 8 + pixelInBlockX;
-
-          // Get grayscale value
          uint8_t gray;
          if (imageInfo.m_comps == 1) {
-            // Grayscale image
-            gray = imageInfo.m_pMCUBufR[mcuIndex];
+            gray = imageInfo.m_pMCUBufR[pixelOffset];
          } else {
-            // RGB image - convert to grayscale
-            const uint8_t r = imageInfo.m_pMCUBufR[mcuIndex];
-            const uint8_t g = imageInfo.m_pMCUBufG[mcuIndex];
-            const uint8_t b = imageInfo.m_pMCUBufB[mcuIndex];
-            // Luminance formula: Y = 0.299*R + 0.587*G + 0.114*B
-            // Using integer approximation: (30*R + 59*G + 11*B) / 100
-            gray = (r * 30 + g * 59 + b * 11) / 100;
+            const uint8_t r = imageInfo.m_pMCUBufR[pixelOffset];
+            const uint8_t g = imageInfo.m_pMCUBufG[pixelOffset];
+            const uint8_t b = imageInfo.m_pMCUBufB[pixelOffset];
+            gray = (r * 25 + g * 50 + b * 25) / 100;
          }

-          // Store grayscale value in MCU row buffer
          mcuRowBuffer[blockY * imageInfo.m_width + pixelX] = gray;
        }
      }
    }

-    // Write all pixel rows from this MCU row to BMP file
+    // Process source rows from this MCU row
    const int startRow = mcuY * mcuPixelHeight;
    const int endRow = (mcuY + 1) * mcuPixelHeight;

    for (int y = startRow; y < endRow && y < imageInfo.m_height; y++) {
-      memset(rowBuffer, 0, bytesPerRow);
+      const int bufferY = y - startRow;

-      // Pack 4 pixels per byte (2 bits each)
-      for (int x = 0; x < imageInfo.m_width; x++) {
-        const int bufferY = y - startRow;
-        const uint8_t gray = mcuRowBuffer[bufferY * imageInfo.m_width + x];
-        const uint8_t twoBit = grayscaleTo2Bit(gray, x, y);
+      if (!needsScaling) {
+        // No scaling - direct output (1:1 mapping)
+        memset(rowBuffer, 0, bytesPerRow);

-        const int byteIndex = (x * 2) / 8;
-        const int bitOffset = 6 - ((x * 2) % 8);  // 6, 4, 2, 0
-        rowBuffer[byteIndex] |= (twoBit << bitOffset);
+        if (USE_8BIT_OUTPUT) {
+          for (int x = 0; x < outWidth; x++) {
+            const uint8_t gray = mcuRowBuffer[bufferY * imageInfo.m_width + x];
+            rowBuffer[x] = adjustPixel(gray);
+          }
+        } else {
+          for (int x = 0; x < outWidth; x++) {
+            const uint8_t gray = mcuRowBuffer[bufferY * imageInfo.m_width + x];
+            uint8_t twoBit;
+            if (atkinsonDitherer) {
+              twoBit = atkinsonDitherer->processPixel(gray, x);
+            } else if (fsDitherer) {
+              twoBit = fsDitherer->processPixel(gray, x, fsDitherer->isReverseRow());
+            } else {
+              twoBit = quantize(gray, x, y);
+            }
+            const int byteIndex = (x * 2) / 8;
+            const int bitOffset = 6 - ((x * 2) % 8);
+            rowBuffer[byteIndex] |= (twoBit << bitOffset);
+          }
+          if (atkinsonDitherer) atkinsonDitherer->nextRow();
+          else if (fsDitherer) fsDitherer->nextRow();
+        }
+        bmpOut.write(rowBuffer, bytesPerRow);
+      } else {
+        // Fixed-point area averaging for exact fit scaling
+        // For each output pixel X, accumulate source pixels that map to it
+        // srcX range for outX: [outX * scaleX_fp >> 16, (outX+1) * scaleX_fp >> 16)
+        const uint8_t* srcRow = mcuRowBuffer + bufferY * imageInfo.m_width;
+
+        for (int outX = 0; outX < outWidth; outX++) {
+          // Calculate source X range for this output pixel
+          const int srcXStart = (static_cast<uint32_t>(outX) * scaleX_fp) >> 16;
+          const int srcXEnd = (static_cast<uint32_t>(outX + 1) * scaleX_fp) >> 16;
+
+          // Accumulate all source pixels in this range
+          int sum = 0;
+          int count = 0;
+          for (int srcX = srcXStart; srcX < srcXEnd && srcX < imageInfo.m_width; srcX++) {
+            sum += srcRow[srcX];
+            count++;
+          }
+
+          // Handle edge case: if no pixels in range, use nearest
+          if (count == 0 && srcXStart < imageInfo.m_width) {
+            sum = srcRow[srcXStart];
+            count = 1;
+          }
+
+          rowAccum[outX] += sum;
+          rowCount[outX] += count;
+        }
+
+        // Check if we've crossed into the next output row
+        // Source Y just past the current row, in 16.16 fixed point: (y + 1) << 16
+        const uint32_t srcY_fp = static_cast<uint32_t>(y + 1) << 16;
+
+        // Output row when source Y crosses the boundary
+        if (srcY_fp >= nextOutY_srcStart && currentOutY < outHeight) {
+          memset(rowBuffer, 0, bytesPerRow);
+
+          if (USE_8BIT_OUTPUT) {
+            for (int x = 0; x < outWidth; x++) {
+              const uint8_t gray = (rowCount[x] > 0) ? (rowAccum[x] / rowCount[x]) : 0;
+              rowBuffer[x] = adjustPixel(gray);
+            }
+          } else {
+            for (int x = 0; x < outWidth; x++) {
+              const uint8_t gray = (rowCount[x] > 0) ? (rowAccum[x] / rowCount[x]) : 0;
+              uint8_t twoBit;
+              if (atkinsonDitherer) {
+                twoBit = atkinsonDitherer->processPixel(gray, x);
+              } else if (fsDitherer) {
+                twoBit = fsDitherer->processPixel(gray, x, fsDitherer->isReverseRow());
+              } else {
+                twoBit = quantize(gray, x, currentOutY);
+              }
+              const int byteIndex = (x * 2) / 8;
+              const int bitOffset = 6 - ((x * 2) % 8);
+              rowBuffer[byteIndex] |= (twoBit << bitOffset);
+            }
+            if (atkinsonDitherer) atkinsonDitherer->nextRow();
+            else if (fsDitherer) fsDitherer->nextRow();
+          }
+
+          bmpOut.write(rowBuffer, bytesPerRow);
+          currentOutY++;
+
+          // Reset accumulators for next output row
+          memset(rowAccum, 0, outWidth * sizeof(uint32_t));
+          memset(rowCount, 0, outWidth * sizeof(uint16_t));
+
+          // Update boundary for next output row
+          nextOutY_srcStart = static_cast<uint32_t>(currentOutY + 1) * scaleY_fp;
+        }
      }
-
-      // Write row with padding
-      bmpOut.write(rowBuffer, bytesPerRow);
    }
  }

  // Clean up
+  if (rowAccum) {
+    delete[] rowAccum;
+  }
+  if (rowCount) {
+    delete[] rowCount;
+  }
+  if (atkinsonDitherer) {
+    delete atkinsonDitherer;
+  }
+  if (fsDitherer) {
+    delete fsDitherer;
+  }
  free(mcuRowBuffer);
  free(rowBuffer);

--- a/lib/JpegToBmpConverter/JpegToBmpConverter.h
+++ b/lib/JpegToBmpConverter/JpegToBmpConverter.h
@ -6,7 +6,7 @@ class ZipFile;

 class JpegToBmpConverter {
  static void writeBmpHeader(Print& bmpOut, int width, int height);
-  static uint8_t grayscaleTo2Bit(uint8_t grayscale, int x, int y);
+  // [COMMENTED OUT] static uint8_t grayscaleTo2Bit(uint8_t grayscale, int x, int y);
  static unsigned char jpegReadCallback(unsigned char* pBuf, unsigned char buf_size,
                                        unsigned char* pBytes_actually_read, void* pCallback_data);

--- a/src/activities/reader/ReaderActivity.cpp
+++ b/src/activities/reader/ReaderActivity.cpp
@ -15,6 +15,10 @@ std::unique_ptr<Epub> ReaderActivity::loadEpub(const std::string& path) {

  auto epub = std::unique_ptr<Epub>(new Epub(path, "/.crosspoint"));
  if (epub->load()) {
+    // Pre-generate cover BMP for sleep screen (so it's cached when entering sleep)
+    if (!epub->generateCoverBmp()) {
+      Serial.printf("[%lu] [RDR] Cover BMP generation skipped or failed\n", millis());
+    }
    return epub;
  }