I started with Python and OpenCV very recently, so I am not very knowledgeable in them yet.
I am trying to detect the greyish rectangular box in a list of images.
The current version of the code works quite well, but sometimes the greyish box is still not detected correctly.
I would appreciate if someone with more experience than me could help me to consistently detect the greyish rectangular box correctly.
Is my approach even the best one? Is there a better approach that I am just not aware of?
(I can only use the CPU currently.)
This is the current version of the code:
import cv2
import matplotlib.pyplot as plt
import os
def _external_contours(binary):
    """Return the external contours of the binary image *binary*."""
    found = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # findContours returns either 2 or 3 values depending on the OpenCV
    # version; the contour list is the first of 2 or the second of 3.
    return found[0] if len(found) == 2 else found[1]


def process_and_save_images(input_dir, save_subdir="processed_7"):
    """Detect the largest blob in the bottom third of every PNG and save a debug figure.

    For each ``.png`` file (case-insensitive) directly inside *input_dir* the
    bottom third of the image is adaptively thresholded, its external contours
    are filled, and the result is cleaned with a morphological opening followed
    by a closing. The bounding rectangle of the largest remaining contour is
    drawn on a copy of the crop. A four-panel matplotlib figure (threshold,
    opening, closing, annotated crop) is written to
    ``<input_dir>/<save_subdir>/<name>_processed.png``.

    Args:
        input_dir: Directory containing the input PNG images.
        save_subdir: Name of the subdirectory of *input_dir* that receives the
            figures; it is created if it does not exist.

    Returns:
        None. Progress messages are printed to standard output.
    """
    # Ensure the output directory exists (no error if it is already there).
    save_dir = os.path.join(input_dir, save_subdir)
    os.makedirs(save_dir, exist_ok=True)

    # Collect the PNG files that live directly in the input directory.
    image_paths = [
        os.path.join(input_dir, name)
        for name in os.listdir(input_dir)
        if name.lower().endswith('.png')
        and os.path.isfile(os.path.join(input_dir, name))
    ]

    # The structuring element is the same for every image, so build it once.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9, 9))

    for image_path in image_paths:
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals an unreadable/corrupt file by returning None
            # instead of raising; without this guard image.shape would crash.
            print(f"Could not read {image_path}. Skipping...")
            continue

        # Crop the bottom third of the image.
        height = image.shape[0]
        cropped = image[int(2*height/3):, :]
        result_cropped = cropped.copy()

        # Grayscale + inverted adaptive threshold (block size 51, offset 9).
        gray = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY)
        thresh = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 51, 9
        )

        # Fill every external contour so outlined regions become solid blobs.
        for contour in _external_contours(thresh):
            cv2.drawContours(thresh, [contour], -1, (255, 255, 255), -1)

        # Opening removes small specks; closing then fills small gaps.
        opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=2)
        closing = cv2.morphologyEx(opening, cv2.MORPH_CLOSE, kernel, iterations=2)

        # Draw the bounding rectangle of the contour with the largest area.
        cnts = _external_contours(closing)
        if cnts:
            largest_contour = max(cnts, key=cv2.contourArea)
            x, y, w, h = cv2.boundingRect(largest_contour)
            cv2.rectangle(result_cropped, (x, y), (x + w, y + h), (36, 255, 12), 3)
        else:
            # Only the rectangle is skipped; the figure below is still saved
            # so the intermediate stages can be inspected.
            print(f"No contours found in {image_path}. Skipping...")

        # Use matplotlib to lay out the intermediate stages side by side.
        panels = [
            (thresh, 'Thresholded Image', 'gray'),
            (opening, 'Morphological Opening', 'gray'),
            (closing, 'Morphological Closing', 'gray'),
            (cv2.cvtColor(result_cropped, cv2.COLOR_BGR2RGB),
             'Image with Largest Rectangle', None),
        ]
        plt.figure(figsize=(20, 10))
        for position, (panel, title, cmap) in enumerate(panels, start=1):
            plt.subplot(1, 4, position)
            plt.imshow(panel, cmap=cmap)
            plt.title(title)
        plt.tight_layout()

        # Build the output name from the stem so that any extension casing
        # (".png", ".PNG") and names containing ".png" twice are handled.
        stem, _ = os.path.splitext(os.path.basename(image_path))
        output_path = os.path.join(save_dir, stem + '_processed.png')
        plt.savefig(output_path)
        plt.close()  # Close the current figure to release memory

    print("Processing and saving completed.")
# Example usage: process the PNG files located directly in ./temporal, using
# the default output subdirectory name.
process_and_save_images('./temporal')
Here is the result for the set of images that I am trying the code with.
https://imgur.com/a/7C78U9l
Here are the original images:
https://imgur.com/a/XKB6KHR
Here is an example where the current approach fails:
I think you can use cv.grabCut
which is sort of designed to do things like this.
And a note on your code, you can use cv.HoughLines
at the final stage and find straight lines instead of contours which are sensitive to the extra parts you segment.
Anyways here is my code and I think it works fine but I only tested a handful of your images.
import cv2 as cv
import numpy as np
# Locate the caption box with GrabCut, then bound it using Hough line segments.
# IMAGE is a placeholder: it must hold the path of the input image.
img = cv.imread(IMAGE)
if img is None:
    # cv.imread returns None (it does not raise) when the file cannot be read.
    raise FileNotFoundError(f"Could not read image: {IMAGE}")
height, width, _ = img.shape

# Region of interest: the bottom third of the frame, inset 5 px from the
# left, right and bottom borders.
p0 = (5, int(2*height/3))
p1 = (width - 5, height - 5)

# Keep only pixels whose B, G and R values all lie in [15, 100]; everything
# else is zeroed so GrabCut only has the dark-grey candidates to work with.
color_mask = cv.inRange(img, (15, 15, 15), (100, 100, 100))
color_mask = color_mask // 255
color_masked = img * color_mask[:, :, None]

mask = np.zeros(img.shape[:2], np.uint8)
bgdModel = np.zeros((1, 65), np.float64)
fgdModel = np.zeros((1, 65), np.float64)
# grabCut expects the rectangle as (x, y, width, height), not as two corner
# points, so convert the corners p0/p1 accordingly.
rect = (p0[0], p0[1], p1[0] - p0[0], p1[1] - p0[1])
cv.grabCut(color_masked, mask, rect, bgdModel, fgdModel, 10, cv.GC_INIT_WITH_RECT)
# Mask values 0 (GC_BGD) and 2 (GC_PR_BGD) are background; the rest is kept.
grab_mask = np.where((mask == 2) | (mask == 0), 0, 1).astype(np.uint8)
masked = color_masked * grab_mask[:, :, np.newaxis]

# Close small holes first, then remove small leftovers.
kernel = cv.getStructuringElement(cv.MORPH_RECT, (5, 5))
closing = cv.morphologyEx(masked, cv.MORPH_CLOSE, kernel, iterations=2)
opening = cv.morphologyEx(closing, cv.MORPH_OPEN, kernel, iterations=2)

canny = cv.Canny(opening, 50, 150)
lines = cv.HoughLinesP(canny, 0.5, np.pi/360, 50)
if lines is None:
    # HoughLinesP yields None when it finds no segment; there is nothing to
    # draw or bound in that case.
    print("No line segments found; no box drawn.")
else:
    for x0, y0, x1, y1 in lines[:, 0]:
        cv.line(img, (int(x0), int(y0)), (int(x1), int(y1)), (255, 0, 0))
    # Bounding rectangle of all segment end points.
    box = cv.boundingRect(lines.reshape(-1, 2))
    cv.rectangle(img, box, (0, 0, 255))

cv.imshow('Result', img)
cv.waitKey(0)
cv.destroyAllWindows()
It’s mostly your code, but I used cv.grabCut
to segment part of the caption (instead of thresholding) and, at the end, used cv.HoughLinesP.
You may have to adjust some parameters and keep testing and tweaking the code until it works for each image. You might also consider using OCR, as M Ciel suggested; I think it’s a more standard approach.
This might not be directly related, but since the box is for subtitles, perhaps you can try OpenCV along with pytesseract’s OCR function to detect the bounding box of any text, and combine it with the binary mask you obtained?