Improve distinction between color and grayscale documents (#79)

Better differentiate color and grayscale documents:
- Look for colored pixels only in the intersection of the mask and quadrilateral
- Apply a white balance (grey world) to the document
- Exclude pixels with extreme luminance
- Erode segmentation mask
This commit is contained in:
pynicolas
2025-12-10 17:08:21 +01:00
committed by GitHub
parent 87433fa96a
commit fe91f3e241
10 changed files with 509 additions and 86 deletions

View File

@@ -0,0 +1,236 @@
/*
* Copyright 2025 Pierre-Yves Nicolas
*
* This program is free software: you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation, either version 3 of the License, or (at your option)
* any later version.
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
* You should have received a copy of the GNU General Public License along with
* this program. If not, see <https://www.gnu.org/licenses/>.
*/
package org.fairscan.imageprocessing
import org.opencv.core.Core
import org.opencv.core.CvType
import org.opencv.core.CvType.CV_8UC1
import org.opencv.core.Mat
import org.opencv.core.Mat.zeros
import org.opencv.core.MatOfPoint
import org.opencv.core.Scalar
import org.opencv.core.Size
import org.opencv.imgproc.Imgproc
import org.opencv.imgproc.Imgproc.fillConvexPoly
import kotlin.math.roundToInt
import kotlin.math.sqrt
/**
 * Tells whether the document visible in [img] should be treated as a color document.
 *
 * Only pixels that belong to both the segmentation [mask] and the [quad] are examined.
 * The image is first downscaled, white-balanced inside the document area and converted
 * to Lab. A pixel counts as "colored" when its chroma (distance of the a/b channels from
 * the neutral value 128) exceeds [chromaThreshold] and its L channel value lies in
 * [luminanceMin]..[luminanceMax].
 *
 * @param img source image; it must be CV_8UC3 (checked by [applyGrayWorldToDocument]) and
 *   is read as BGR. It is not modified.
 * @param mask segmentation mask of the document.
 * @param quad document quadrilateral, expressed in the coordinates of [img].
 * @param chromaThreshold minimum chroma for a pixel to be considered colored.
 * @param proportionThreshold the document is colored when the share of colored pixels
 *   among the document pixels is strictly greater than this value.
 * @param luminanceMin lower bound (inclusive) on the L channel, in the units of the 8-bit
 *   Lab image produced by OpenCV.
 * @param luminanceMax upper bound (inclusive) on the L channel, same units.
 * @return `true` when the proportion of colored pixels exceeds [proportionThreshold];
 *   `false` otherwise, including when the document area is empty.
 */
fun isColoredDocument(
    img: Mat,
    mask: Mask,
    quad: Quad,
    chromaThreshold: Double = 17.5,
    proportionThreshold: Double = 0.0003,
    luminanceMin: Double = 40.0,
    luminanceMax: Double = 180.0
): Boolean {
    // Every Mat created here is tracked so that native memory is released on all paths,
    // including early returns and exceptions thrown by OpenCV.
    val owned = ArrayList<Mat>()
    fun track(mat: Mat): Mat {
        owned.add(mat)
        return mat
    }

    try {
        // Work on a reasonable size (for correct performance)
        val resizedImg = track(resizeForMaxPixels(img, 1024.0 * 768.0))
        val workSize = resizedImg.size()

        // 1) Compute doc mask (mask ∩ quad)
        val docMask = track(documentMask(mask, quad, img.size(), workSize))

        // Nothing to analyse: bail out before any further processing. This also keeps
        // docMask from being used after a callee has released it on the empty-mask path.
        val totalPixels = Core.countNonZero(docMask)
        if (totalPixels == 0) return false

        // 2) Apply white balance only inside document
        val whiteBalanced = track(applyGrayWorldToDocument(resizedImg, docMask))

        // 3) Convert to Lab, see https://en.wikipedia.org/wiki/CIELAB_color_space
        val lab = track(Mat())
        Imgproc.cvtColor(whiteBalanced, lab, Imgproc.COLOR_BGR2Lab)

        // 4) Split Lab
        val channels = ArrayList<Mat>()
        Core.split(lab, channels)
        owned.addAll(channels)
        val luminance = channels[0]

        // 5) Compute chroma and keep the pixels above the threshold
        val chromaMap = track(chroma(channels[1], channels[2]))
        val colorMask = track(Mat())
        Imgproc.threshold(chromaMap, colorMask, chromaThreshold, 255.0, Imgproc.THRESH_BINARY)
        // threshold keeps the float type of chromaMap; the bitwise ops below need 8-bit masks
        colorMask.convertTo(colorMask, CvType.CV_8U)

        // 6) Create luminance mask L ∈ [luminanceMin, luminanceMax]
        val luminanceMask = track(Mat())
        Core.inRange(luminance, Scalar(luminanceMin), Scalar(luminanceMax), luminanceMask)

        // 7) Combine colorMask & luminanceMask & docMask
        val colorAndLuminance = track(Mat())
        Core.bitwise_and(colorMask, luminanceMask, colorAndLuminance)
        val restrictedMask = track(Mat())
        Core.bitwise_and(colorAndLuminance, docMask, restrictedMask)

        val coloredPixels = Core.countNonZero(restrictedMask)
        val proportion = coloredPixels.toDouble() / totalPixels.toDouble()
        return proportion > proportionThreshold
    } finally {
        // 8) Cleanup
        owned.forEach { it.release() }
    }
}
/**
 * Returns a new Mat holding [img] limited to roughly [maxPixels] pixels.
 *
 * When [img] is already small enough a clone is returned; otherwise both dimensions are
 * multiplied by the same factor and the image is resampled with INTER_AREA.
 * The caller owns the returned Mat in both cases.
 */
private fun resizeForMaxPixels(img: Mat, maxPixels: Double): Mat {
    val pixelCount = img.width() * img.height()
    return if (pixelCount <= maxPixels) {
        img.clone()
    } else {
        // Same factor on both axes, so the aspect ratio is preserved
        val factor = sqrt(maxPixels / pixelCount)
        val targetSize = Size(img.width() * factor, img.height() * factor)
        Mat().also { downscaled ->
            Imgproc.resize(img, downscaled, targetSize, 0.0, 0.0, Imgproc.INTER_AREA)
        }
    }
}
/**
 * Computes the per-pixel chroma from the [a] and [b] channels.
 *
 * Each channel is converted to 32-bit float, shifted by -128 and the Euclidean magnitude
 * of the two shifted channels is returned as a new Mat owned by the caller.
 * The inputs are not modified.
 */
private fun chroma(a: Mat, b: Mat): Mat {
    // Float copy of a channel with 128 subtracted from every value
    fun centered(channel: Mat): Mat {
        val asFloat = Mat()
        channel.convertTo(asFloat, CvType.CV_32F)
        val shifted = Mat()
        Core.subtract(asFloat, Scalar(128.0), shifted)
        asFloat.release()
        return shifted
    }

    val aCentered = centered(a)
    val bCentered = centered(b)

    val magnitude = Mat()
    Core.magnitude(aCentered, bCentered, magnitude)

    aCentered.release()
    bCentered.release()
    return magnitude
}
/**
 * Erodes [mask] with an elliptical kernel whose size depends on [quad].
 *
 * The kernel side is 2% of the shortest quad edge, clamped to 3..15 and bumped to the
 * next odd number when even. Returns a new Mat; [mask] itself is left untouched.
 */
private fun erodeBorder(mask: Mat, quad: Quad): Mat {
    val shortestEdge = quad.edges().minOf { it.norm() }
    val clamped = (shortestEdge * 0.02).roundToInt().coerceIn(3, 15)
    // Use an odd kernel size
    val kernelSide = (if (clamped % 2 == 0) clamped + 1 else clamped).toDouble()

    val kernel = Imgproc.getStructuringElement(Imgproc.MORPH_ELLIPSE, Size(kernelSide, kernelSide))
    val result = Mat()
    Imgproc.morphologyEx(mask, result, Imgproc.MORPH_ERODE, kernel)
    kernel.release()
    return result
}
/**
 * Builds the mask of pixels that belong to the document: the eroded segmentation mask
 * intersected with the filled quadrilateral, at the working resolution.
 *
 * @param mask segmentation mask of the document.
 * @param quad document quadrilateral in the coordinates of an image of size [origSize].
 * @param origSize size the [quad] coordinates refer to.
 * @param workSize size of the returned mask.
 * @return a new Mat owned by the caller.
 */
private fun documentMask(
    mask: Mask,
    quad: Quad,
    origSize: Size,
    workSize: Size,
): Mat {
    // Segmentation mask brought to the working resolution
    val sourceMask = mask.toMat()
    val scaledMask = Mat()
    Imgproc.resize(sourceMask, scaledMask, workSize, 0.0, 0.0, Imgproc.INTER_AREA)

    // Quad expressed in working-resolution coordinates
    val scaledQuad = quad.scaledTo(
        origSize.width, origSize.height, workSize.width, workSize.height
    )

    // Shrink the mask to stay away from the document border
    val shrunkMask = erodeBorder(scaledMask, scaledQuad)

    // Filled quad drawn on an empty 8-bit canvas of the same size
    val quadArea = zeros(shrunkMask.size(), CV_8UC1)
    val corners = MatOfPoint(
        scaledQuad.topLeft.toCv(),
        scaledQuad.topRight.toCv(),
        scaledQuad.bottomRight.toCv(),
        scaledQuad.bottomLeft.toCv(),
    )
    fillConvexPoly(quadArea, corners, Scalar(255.0))

    val intersection = Mat()
    Core.bitwise_and(shrunkMask, quadArea, intersection)

    listOf(quadArea, corners, shrunkMask, scaledMask, sourceMask).forEach { it.release() }
    return intersection
}
/**
 * Applies a gray-world white balance to the pixels of [img] selected by [docMask].
 *
 * The per-channel means are computed over the masked pixels only; each channel is then
 * scaled so that its mean matches the average of the three means. Pixels outside the
 * mask are copied unchanged.
 *
 * Neither [img] nor [docMask] is modified or released: both remain owned by the caller.
 *
 * @param img source image, must be of type CV_8UC3.
 * @param docMask mask selecting the document pixels (non-zero = selected).
 * @return a new CV_8UC3 Mat owned by the caller; a plain clone of [img] when the mask
 *   selects no pixel.
 * @throws IllegalArgumentException if [img] is not CV_8UC3.
 */
fun applyGrayWorldToDocument(
    img: Mat,
    docMask: Mat,
): Mat {
    require(img.type() == CvType.CV_8UC3) { "img must be CV_8UC3 but has type ${img.type()}" }

    // The mask belongs to the caller, which keeps using it afterwards: never release it here.
    if (Core.countNonZero(docMask) == 0) {
        return img.clone()
    }

    // compute mean per channel on docMask (B,G,R)
    val means = Core.mean(img, docMask).`val`

    // safety: avoid division by very small values
    val eps = 1e-6
    val meanB = means[0].coerceAtLeast(eps)
    val meanG = means[1].coerceAtLeast(eps)
    val meanR = means[2].coerceAtLeast(eps)
    val meanGray = (meanB + meanG + meanR) / 3.0

    // per-channel gains, in BGR order
    val scales = Scalar(meanGray / meanB, meanGray / meanG, meanGray / meanR)

    val imgF = Mat()
    val scaledF = Mat()
    val scaled8 = Mat()
    try {
        // scale in float, then convert back to 8 bits
        img.convertTo(imgF, CvType.CV_32FC3)
        Core.multiply(imgF, scales, scaledF)
        scaledF.convertTo(scaled8, CvType.CV_8UC3)

        // result = original copy, then copy scaled pixels where docMask != 0
        val result = img.clone()
        scaled8.copyTo(result, docMask)
        return result
    } finally {
        // temporaries are released even when an OpenCV call throws
        imgF.release()
        scaledF.release()
        scaled8.release()
    }
}

View File

@@ -18,7 +18,6 @@ import org.fairscan.imageprocessing.quad.detectDocumentQuadFromProbmap
import org.fairscan.imageprocessing.quad.findQuadFromRightAngles
import org.fairscan.imageprocessing.quad.minAreaRect
import org.opencv.core.Core
import org.opencv.core.CvType
import org.opencv.core.Mat
import org.opencv.core.MatOfPoint
import org.opencv.core.MatOfPoint2f
@@ -62,10 +61,7 @@ fun detectDocumentQuad(mask: Mask, isLiveAnalysis: Boolean, minQuadAreaRatio: Do
}
private fun biggestContour(mat: Mat): Pair<MatOfPoint2f?, Double> {
val mat8u = Mat()
mat.convertTo(mat8u, CvType.CV_8UC1, 255.0)
val refinedMask = refineMask(mat8u)
val refinedMask = refineMask(mat)
val blurred = Mat()
Imgproc.GaussianBlur(refinedMask, blurred, Size(5.0, 5.0), 0.0)
@@ -116,7 +112,12 @@ fun refineMask(original: Mat): Mat {
return opened
}
fun extractDocument(inputMat: Mat, quad: Quad, rotationDegrees: Int): Mat {
fun extractDocument(
inputMat: Mat,
quad: Quad,
rotationDegrees: Int,
mask: Mask,
): Mat {
val widthTop = norm(quad.topLeft, quad.topRight)
val widthBottom = norm(quad.bottomLeft, quad.bottomRight)
val targetWidth = (widthTop + widthBottom) / 2
@@ -144,7 +145,8 @@ fun extractDocument(inputMat: Mat, quad: Quad, rotationDegrees: Int): Mat {
Imgproc.warpPerspective(inputMat, outputMat, transform, outputSize)
val resized = resize(outputMat, 1500.0)
val enhanced = enhanceCapturedImage(resized)
val isColored = isColoredDocument(inputMat, mask, quad)
val enhanced = enhanceCapturedImage(resized, isColored)
val rotated = rotate(enhanced, rotationDegrees)
return rotated

View File

@@ -81,9 +81,9 @@ fun createQuad(vertices: List<Point>): Quad {
return Quad(sorted[0], sorted[1], sorted[2], sorted[3])
}
fun Quad.scaledTo(fromWidth: Int, fromHeight: Int, toWidth: Int, toHeight: Int): Quad {
val scaleX = toWidth.toFloat() / fromWidth
val scaleY = toHeight.toFloat() / fromHeight
fun Quad.scaledTo(fromWidth: Double, fromHeight: Double, toWidth: Double, toHeight: Double): Quad {
val scaleX = toWidth / fromWidth
val scaleY = toHeight / fromHeight
return Quad(
topLeft = topLeft.scaled(scaleX, scaleY),
topRight = topRight.scaled(scaleX, scaleY),
@@ -92,6 +92,14 @@ fun Quad.scaledTo(fromWidth: Int, fromHeight: Int, toWidth: Int, toHeight: Int):
)
}
fun Point.scaled(scaleX: Float, scaleY: Float): Point {
/**
 * Convenience overload taking integer dimensions: converts them to Double and delegates
 * to the Double-based `scaledTo`.
 */
fun Quad.scaledTo(fromWidth: Int, fromHeight: Int, toWidth: Int, toHeight: Int): Quad =
    scaledTo(
        fromWidth.toDouble(),
        fromHeight.toDouble(),
        toWidth.toDouble(),
        toHeight.toDouble(),
    )
/** Returns a new point with x multiplied by [scaleX] and y multiplied by [scaleY]. */
fun Point.scaled(scaleX: Double, scaleY: Double): Point =
    Point(x * scaleX, y * scaleY)

View File

@@ -22,8 +22,8 @@ import org.opencv.core.Size
import org.opencv.imgproc.Imgproc
import kotlin.math.max
fun enhanceCapturedImage(img: Mat): Mat {
return if (isColoredDocument(img)) {
fun enhanceCapturedImage(img: Mat, isColored: Boolean): Mat {
return if (isColored) {
val result = Mat()
Core.convertScaleAbs(img, result, 1.2, 10.0)
result
@@ -36,63 +36,6 @@ fun enhanceCapturedImage(img: Mat): Mat {
}
}
/**
 * Tells whether [img] contains a significant share of colored pixels.
 *
 * The image is converted from BGR to Lab; the chroma of each pixel is the Euclidean
 * distance of its a/b values from 128. The image is considered colored when the
 * proportion of pixels whose chroma exceeds [chromaThreshold] is strictly greater than
 * [proportionThreshold]. All pixels of [img] are taken into account.
 */
fun isColoredDocument(
    img: Mat,
    chromaThreshold: Double = 20.0,
    proportionThreshold: Double = 0.001
): Boolean {
    val labImage = Mat()
    Imgproc.cvtColor(img, labImage, Imgproc.COLOR_BGR2Lab)
    val labChannels = ArrayList<Mat>()
    Core.split(labImage, labChannels)

    // (channel - 128)^2 computed in 32-bit float
    fun squaredOffset(channel: Mat): Mat {
        val asFloat = Mat()
        channel.convertTo(asFloat, CvType.CV_32F)
        val centered = Mat()
        Core.subtract(asFloat, Scalar(128.0), centered)
        val squared = Mat()
        Core.multiply(centered, centered, squared)
        asFloat.release()
        centered.release()
        return squared
    }

    val aSquared = squaredOffset(labChannels[1])
    val bSquared = squaredOffset(labChannels[2])

    val squaredSum = Mat()
    Core.add(aSquared, bSquared, squaredSum)
    val chromaMap = Mat()
    Core.sqrt(squaredSum, chromaMap)

    val aboveThreshold = Mat()
    Imgproc.threshold(chromaMap, aboveThreshold, chromaThreshold, 1.0, Imgproc.THRESH_BINARY)

    val coloredCount = Core.countNonZero(aboveThreshold)
    val pixelCount = chromaMap.rows() * chromaMap.cols()
    val coloredShare = coloredCount.toDouble() / pixelCount.toDouble()

    labImage.release()
    labChannels.forEach { it.release() }
    listOf(aSquared, bSquared, squaredSum, chromaMap, aboveThreshold).forEach { it.release() }

    return coloredShare > proportionThreshold
}
private fun multiScaleRetinex(img: Mat): Mat {
val imageSize = img.size()
val maxDim = max(imageSize.width, imageSize.height)