# Source code for lecture2notes.end_to_end.slide_structure_analysis

import json
import logging
import os
from functools import partial

import cv2
import numpy as np
import pandas as pd
import pytesseract
from skimage import img_as_float

# https://scikit-image.org/docs/stable/auto_examples/segmentation/plot_peak_local_max.html
from skimage.feature import peak_local_max
from tqdm import tqdm

from .helpers import frame_number_from_filename

# Module-level line counter, initialised to 0 when the module is imported.
# NOTE(review): ``analyze_structure`` also keeps a counter named ``prev_line_num``;
# confirm whether anything still depends on this module-level value before removing it.
prev_line_num = 0
# Logger named after this module.
logger = logging.getLogger(__name__)


def stroke_width(image):
    """
    Estimate the average stroke width of the content of ``image``.

    The image is converted to grayscale, binarised with an inverted Otsu
    threshold and passed through an L2 distance transform. The returned value
    is the mean of the distance-transform values found at the local maxima of
    that transform. Inspired by: https://stackoverflow.com/a/61914060.

    Other Links:

    * `cv2.distanceTransform Documentation <https://docs.opencv.org/3.4/d7/d1b/group__imgproc__misc.html#ga25c259e7e2fa2ac70de4606ea800f12f>`_
    * `OpenCV Distance Transform Tutorial <https://docs.opencv.org/3.4/d2/dbd/tutorial_distance_transform.html>`_
    * `Sckit-Image "Finding local maxima" <https://scikit-image.org/docs/stable/auto_examples/segmentation/plot_peak_local_max.html>`_
    * `skimage.feature.peak_local_max <https://scikit-image.org/docs/stable/api/skimage.feature.html#skimage.feature.peak_local_max>`_

    Args:
        image: Image handed to ``cv2.cvtColor`` with ``cv2.COLOR_BGR2GRAY``.

    Returns:
        The mean distance-transform value at the detected peaks, or ``0`` when
        the grayscale conversion raises ``cv2.error``.
    """
    try:
        grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    except cv2.error:
        # The conversion failed (for example because the image was not found).
        return 0

    threshold_flags = cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU
    _, binary_inverted = cv2.threshold(grayscale, 40, 255, threshold_flags)

    distance_map = cv2.distanceTransform(binary_inverted, cv2.DIST_L2, 5)
    peaks = peak_local_max(img_as_float(distance_map), min_distance=2)

    # Each peak is a (row, column) pair indexing into the distance map.
    peak_distances = [distance_map[row, col] for row, col in peaks]
    return np.array(peak_distances).mean()
def identify_title(
    tesseract_df,
    image,
    left_start_maximum=0.77,
    character_limit=3,
    enabled_checks=None,
):
    """Decide whether the first block/paragraph of a slide is its title.

    The candidate is every row of ``tesseract_df`` with ``block_num == 1`` and
    ``par_num == 1``. It is accepted as the title only if it passes every
    enabled check:

    * ``"in_upper_third"``: the mean ``top`` of the candidate is above one
      third of the image height.
    * ``"in_top_left"``: the mean ``left`` of the candidate is smaller than
      ``image_width * left_start_maximum``.
    * ``"meets_character_limit"``: the candidate text (joined without
      spaces) is longer than ``character_limit`` characters.
    * ``"large_stroke_width"``: the mean ``stroke_width`` of the candidate is
      more than one standard deviation above the mean of all rows.
    * ``"large_height"``: the mean ``height`` of the candidate is greater than
      the mean of all rows.

    If the whole frame holds a single block and a single paragraph, the slide
    might only contain the title, so ``"large_stroke_width"`` and
    ``"large_height"`` are skipped.

    Args:
        tesseract_df (pd.DataFrame): Word-level rows with the columns
            ``block_num``, ``par_num``, ``top``, ``left``, ``height``,
            ``text``, ``stroke_width`` and ``global_line_num``.
        image (np.array): The slide image; only ``image.shape[:2]`` is read.
        left_start_maximum (float, optional): Fraction of the image width the
            candidate's mean left edge must stay below. Defaults to 0.77.
        character_limit (int, optional): The candidate must contain more than
            this many characters. Defaults to 3.
        enabled_checks (list, optional): Names of the checks to run. Defaults
            to all five checks listed above.

    Returns:
        tuple or ``None``: ``(raw_text, global_line_nums)`` where ``raw_text``
        is the candidate's words joined by spaces and ``global_line_nums`` is
        the sorted list of distinct ``global_line_num`` values of the
        candidate. ``None`` if any enabled check fails or if there is no row
        for the first block and paragraph.
    """
    if enabled_checks is None:
        enabled_checks = [
            "in_upper_third",
            "in_top_left",
            "large_stroke_width",
            "large_height",
            "meets_character_limit",
        ]

    image_height, image_width = image.shape[:2]

    # With only one block and one paragraph there is nothing to compare the
    # candidate against, so the relative size checks are meaningless.
    single_block = tesseract_df["block_num"].max() == tesseract_df["block_num"].min()
    single_paragraph = tesseract_df["par_num"].max() == tesseract_df["par_num"].min()
    if single_block and single_paragraph:
        enabled_checks = [
            check
            for check in enabled_checks
            if check not in ("large_stroke_width", "large_height")
        ]

    first_block = tesseract_df[
        (tesseract_df["block_num"] == 1) & (tesseract_df["par_num"] == 1)
    ]
    if first_block.empty:
        # No candidate rows: never report an empty string as the title.
        return None

    # The comparisons below are written as ``not (a < b)`` / ``not (a > b)`` on
    # purpose: a NaN mean or standard deviation makes the comparison False and
    # therefore fails the check.
    if "in_upper_third" in enabled_checks:
        if not (first_block["top"].mean() < image_height / 3):
            return None

    if "in_top_left" in enabled_checks:
        if not (first_block["left"].mean() < image_width * left_start_maximum):
            return None

    if "meets_character_limit" in enabled_checks:
        if not (len("".join(first_block["text"])) > character_limit):
            return None

    if "large_stroke_width" in enabled_checks:
        stroke_threshold = (
            tesseract_df["stroke_width"].mean() + tesseract_df["stroke_width"].std()
        )
        if not (first_block["stroke_width"].mean() > stroke_threshold):
            return None

    if "large_height" in enabled_checks:
        if not (first_block["height"].mean() > tesseract_df["height"].mean()):
            return None

    raw_text = " ".join(first_block["text"]).strip()
    # ``set`` removes duplicates; ``sorted`` makes the order deterministic.
    global_line_nums = sorted(set(first_block["global_line_num"]))
    return raw_text, global_line_nums
def analyze_structure(
    image,
    to_json=None,
    return_unstructured_text=True,
    gamma=0.1,
    beta=0.2,
    orient="index",
    extra_json=None,
):
    """Perform slide structure analysis.

    The image is run through ``pytesseract.image_to_data``; the recognized
    words are grouped into lines, and each line is assigned a category:
    ``-1`` small text, ``0`` normal text, ``1`` subtitle/bold, ``2`` title.

    Args:
        image (np.array): Image to be processed as loaded with ``cv2.imread()``.
        to_json (str or bool, optional): Path to write json output or a boolean
            to return json data as a string. The default return value is a
            pd.DataFrame. Defaults to None.
        return_unstructured_text (bool, optional): If the raw recognized text
            should be returned in addition to the other return values. Small
            text (category ``-1``) is left out of the raw text.
        gamma (float, optional): The percentage greater than or less than the
            average **stroke width** that a text line must meet to be classified
            as bold/subtitle or small text respectively. Defaults to 0.1.
        beta (float, optional): The percentage greater than or less than the
            average **height** that a text line must meet to be classified as
            bold/subtitle or small text respectively. This is greater than
            ``gamma`` because height is on a larger scale than stroke width.
            Defaults to 0.2.
        orient (str, optional): The format of the json data written when
            ``to_json`` is a path. The acceptable values can be found on the
            `pandas.DataFrame.to_json documentation <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html>`_.
            It is not applied when ``to_json`` is ``True``. Defaults to "index".
        extra_json (dict, optional): Additional keys and values to add to the
            json output if ``to_json`` is enabled. When ``to_json`` is a path
            the data is merged with ``dict.update``, so ``orient`` must produce
            a json object. Defaults to None.

    Returns:
        pd.DataFrame or str or list or ``None``: The default is to return a
        pd.DataFrame. However, setting ``to_json`` to a string will instead
        write json data to ``to_json`` and return the path to the data. Setting
        ``to_json`` to ``True`` will return the json data as a string. Setting
        ``return_unstructured_text`` returns a two-item list holding the
        previously described data followed by the raw recognized text. Will
        return ``None`` if no text is detected.
    """

    def add_stroke_width_info(row):
        """Compute the stroke width of the image region covered by one word."""
        x = row["left"]
        y = row["top"]
        w = row["width"]
        h = row["height"]
        word = image[y : y + h, x : x + w]
        return stroke_width(word)

    # Line counter for this call only; ``add_line_info`` advances it.
    prev_line_num = 0

    def add_line_info(row):
        """
        Adds the global line number (``line_num``) independent of
        page/block/paragraph to each row.
        Inspired by https://stackoverflow.com/a/53118102.
        """
        # ``nonlocal`` binds the counter of this call, so numbering restarts
        # for every image instead of continuing from the previous one.
        nonlocal prev_line_num
        if row["word_num"] == 1:
            prev_line_num += 1
        return prev_line_num

    def categorize_text(row, avg_height, avg_stroke_width, gamma, beta):
        """Return -1 (small), 0 (normal) or 1 (subtitle/bold) for one line.

        Category 2 (title text) is assigned separately by the caller.
        """
        line_stroke_width = row["stroke_width"]
        line_height = row["height"]

        # Either measurement being large is enough for subtitle/bold ...
        if line_stroke_width > avg_stroke_width * (
            1 + gamma
        ) or line_height > avg_height * (1 + beta):
            return 1
        # ... but both must be small for the line to count as small text.
        if line_stroke_width < avg_stroke_width * (
            1 - gamma
        ) and line_height < avg_height * (1 - beta):
            return -1
        return 0

    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    data_df = pytesseract.image_to_data(
        image_rgb, output_type=pytesseract.Output.DATAFRAME
    )
    data_df = data_df.dropna()
    # Remove 0 width or 0 height matches, which are mistakes
    data_df = data_df[(data_df["width"] != 0) & (data_df["height"] != 0)]
    # Remove empty text lines. ``copy`` gives an independent frame so the
    # column assignments below do not write to a slice of another frame.
    data_df = data_df[(data_df["text"] != "") & (data_df["text"] != " ")].copy()

    if data_df.empty:
        logger.warning("No text detected in this slide")
        return None

    data_df["global_line_num"] = data_df.apply(add_line_info, axis=1)
    data_df["stroke_width"] = data_df.apply(add_stroke_width_info, axis=1)

    title_output = identify_title(data_df, image)

    # Collapse the word rows into one row per line: numeric columns are
    # averaged and the words are joined into the line text.
    # https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html
    grouped = data_df.groupby("global_line_num")
    grouped_text = grouped["text"].apply(
        lambda words: " ".join(words).strip()
    )  # https://stackoverflow.com/a/27298308
    # ``numeric_only`` keeps the ``text`` column out of the mean; without it
    # recent pandas versions raise on the non-numeric column.
    grouped_others = grouped.mean(numeric_only=True)
    lines = pd.concat((grouped_others, grouped_text), axis=1)

    _categorize_text = partial(
        categorize_text,
        avg_height=data_df["height"].mean(),
        avg_stroke_width=data_df["stroke_width"].mean(),
        gamma=gamma,
        beta=beta,
    )
    lines["category"] = lines.apply(_categorize_text, axis=1)

    # Set the title lines to category 2.
    if title_output is not None:
        title_global_line_nums = title_output[1]
        lines.loc[title_global_line_nums, "category"] = 2

    lines = lines.drop(
        columns=[
            "level",
            "page_num",
            "word_num",
            "height",
            "left",
            "top",
            "width",
            "conf",
            "stroke_width",
        ]
    )

    # Averaging turned these identifiers into floats; restore integers.
    columns_to_int = ["block_num", "par_num", "line_num"]
    lines[columns_to_int] = lines[columns_to_int].astype(int)

    non_small_text_series = lines.loc[lines["category"] != -1, "text"]
    raw_text = " ".join(non_small_text_series).strip()

    to_return = []

    if to_json:
        if isinstance(to_json, bool):
            # Return the json data as a string.
            if extra_json:
                lines_dict = lines.to_dict()
                lines_dict.update(extra_json)
                json_data = json.dumps(lines_dict)
            else:
                json_data = lines.to_json()
            to_return.append(json_data)
        else:
            # ``to_json`` is a path: write the json data there.
            if extra_json:
                # ``DataFrame.to_json`` returns ``None`` when given a path, so
                # serialise to a string first, merge, then write the file.
                json_data = json.loads(lines.to_json(orient=orient))
                json_data.update(extra_json)
                with open(to_json, "w+") as json_file:
                    json.dump(json_data, json_file)
            else:
                lines.to_json(to_json, orient=orient)
            to_return.append(to_json)
    else:
        to_return.append(lines)

    if return_unstructured_text:
        to_return.append(raw_text)

    if len(to_return) == 1:
        return to_return[0]
    return to_return
def all_in_folder(path, do_rename=True, **kwargs):
    """Perform structure analysis and OCR on every file in folder using
    :meth:`~lecture2notes.end_to_end.slide_structure_analysis.analyze_structure`.

    Files are processed in sorted name order. Entries that are not regular
    files, and files that ``cv2.imread`` cannot load, are skipped. Slides in
    which no text is detected contribute nothing to the returned lists.

    Args:
        path (str): Directory containing images to process.
        do_rename (bool, optional): Rename files to just their frame number
            (keeping the file extension). Defaults to True.
        ``**kwargs`` (dict, optional) is passed to
            :meth:`lecture2notes.end_to_end.slide_structure_analysis.analyze_structure`.

    Returns:
        tuple: (raw_texts, json_texts) A list of the raw text for each slide and
        a list of the json structure analysis data for each slide.
    """
    raw_texts = []
    json_texts = []

    images = sorted(os.listdir(path))

    for item in tqdm(
        images, total=len(images), desc="> Slide Structure Analysis: Progress"
    ):
        current_path = os.path.join(path, item)
        if not os.path.isfile(current_path):
            continue

        image = cv2.imread(current_path)
        if image is None:
            # ``cv2.imread`` returns None instead of raising when the file is
            # not a readable image; skip it rather than crash later on.
            logger.warning(
                "Skipping %s: it could not be read as an image", current_path
            )
            continue

        frame_number = frame_number_from_filename(current_path)

        if do_rename:
            file_extension = os.path.splitext(item)[1]
            new_path = os.path.join(path, str(frame_number) + file_extension)
            os.rename(current_path, new_path)

        frame_number = int(frame_number)

        analyze_structure_outputs = analyze_structure(
            image, to_json=True, extra_json={"frame_number": frame_number}, **kwargs
        )
        if analyze_structure_outputs is None:
            # No text was detected in this slide.
            continue

        json_text, raw_text = analyze_structure_outputs
        raw_texts.append(raw_text)
        json_texts.append(json_text)

    return raw_texts, json_texts
def write_to_file(raw_texts, json_texts, raw_save_file, json_save_file):
    """Save the raw text and the json data of a set of slides to disk.

    Every entry of ``raw_texts`` is written to ``raw_save_file`` followed by
    ``"\\r\\n"``. The entries of ``json_texts`` are written to
    ``json_save_file`` as one json array: the strings are separated by
    ``", "`` and wrapped in square brackets. Used to write results from
    :meth:`~lecture2notes.end_to_end.slide_structure_analysis.all_in_folder`
    to disk.

    Args:
        raw_texts (list): List of raw text outputs from
            :meth:`~lecture2notes.end_to_end.slide_structure_analysis.analyze_structure`.
        json_texts (list): List of json ssa outputs from
            :meth:`~lecture2notes.end_to_end.slide_structure_analysis.analyze_structure`.
        raw_save_file (str): The path to save the raw text. A ".txt" file.
        json_save_file (str): The path to save the json output. A ".json" file.
    """
    raw_target = str(raw_save_file)
    json_target = str(json_save_file)

    logger.info("Writing raw text to file " + raw_target)
    with open(raw_save_file, "w+") as raw_handle:
        raw_handle.writelines(text + "\r\n" for text in raw_texts)
    logger.debug("Raw text written to " + raw_target)

    logger.info("Writing JSON text to file " + json_target)
    with open(json_save_file, "w+") as json_handle:
        # Joining puts the separator between entries only, never after the last.
        json_handle.write("[" + ", ".join(json_texts) + "]")
    logger.debug("JSON text written to " + json_target)
# Example usage:
# analyze_structure(cv2.imread("test_data/MIT2_627F13_lec04-11.png"), "test.json")
# outputs = all_in_folder("test_data")
# write_to_file(outputs[0], outputs[1], "remove1.txt", "remove2.json")