Source code for lecture2notes.end_to_end.summarizer_class

import glob
import json
import logging
import os
from argparse import Namespace
from functools import wraps
from pathlib import Path
from shutil import rmtree
from timeit import default_timer as timer

# Step imports: extract figures, slide structure analysis, perspective crop,
# and black border removal
from . import (
    border_removal,
    corner_crop_transform,
    figure_detection,
    imghash,
    sift_matcher,
    slide_structure_analysis,
)

# Step cluster slides imports
from .cluster import ClusterFilesystem

# Step extract frames imports
from .frames_extractor import extract_frames
from .helpers import copy_all, frame_number_from_filename, gen_unique_id
from .segment_cluster import SegmentCluster

# Step classify slides imports
from .slide_classifier import classify_frames
from .spell_check import SpellChecker
from .summarization_approaches import (
    cluster,
    full_sents,
    generic_abstractive,
    generic_extractive_sumy,
    get_complete_sentences,
    keyword_based_ext,
    structured_joined_sum,
)

# Step transcribe audio imports
from .transcribe import transcribe_main as transcribe

logger = logging.getLogger(__name__)


def time_this(f):
    # Wraps a function so that it returns a tuple of
    # (function_outputs, elapsed_seconds).
    @wraps(f)
    def decorated_function(*args, **kwargs):
        start_time = timer()
        function_outputs = f(*args, **kwargs)
        end_time = timer() - start_time
        return function_outputs, end_time

    return decorated_function
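
# Illustrative sketch (not part of the module): because `time_this` returns a
# `(function_outputs, elapsed_seconds)` tuple, callers unpack both values:
#
#     @time_this
#     def expensive_step():
#         return "done"
#
#     outputs, seconds = expensive_step()  # outputs == "done"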

class LectureSummarizer:
    def __init__(self, params, **kwargs):
        # Accept a path to a JSON file, a dict, or an argparse Namespace.
        if type(params) is str or type(params) is Path:
            with open(params, "r") as json_file:
                params = json.load(json_file)
        if type(params) is dict:
            params = Namespace(**params)
        for name in kwargs:
            setattr(params, name, kwargs[name])

        # Perform argument checks
        if (
            params.transcription_method == "deepspeech"
            or params.transcription_method == "vosk"
        ) and params.transcribe_model_dir is None:
            logger.error(
                "The DeepSpeech and Vosk methods require --transcribe_model_dir to be set to the directory containing the deepspeech/vosk models. See the documentation for details."
            )
        if (params.summarization_mods is not None) and (
            "none" in params.summarization_mods and len(params.summarization_mods) > 1
        ):  # 'none' and another option were specified
            logger.error(
                "If 'none' is specified in --summarization_mods then no other options can be selected."
            )

        self.all_step_functions = [
            self.step_extract_frames,
            self.step_classify_slides,
            self.step_black_border_removal,
            self.step_perspective_crop,
            self.step_cluster_slides,
            self.step_slide_structure_analysis,
            self.step_extract_figures,
            self.step_transcribe_audio,
            self.step_summarize,
        ]

        if params.spell_check:
            self.spell_checker = SpellChecker()

        self.final_data = {
            "structured_summary": None,
            "lecture_summary": None,
            "transcript": None,
        }

        self.params = params
        self.transcription_method_default = "vosk"
        self.root_process_folder = self.determine_root_path()
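    # Illustrative usage sketch (the file name is hypothetical): `params` may
    # be a JSON file path, a dict, or a Namespace, and keyword arguments
    # override individual fields:
    #
    #     summarizer = LectureSummarizer("params.json", skip_to=1)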
    def determine_root_path(self):
        if self.params.process_dir == "automatic":
            self.root_process_folder = Path(os.path.dirname(self.params.video_path))
        else:
            self.root_process_folder = Path(self.params.process_dir)
        if self.params.auto_id:
            unique_id = gen_unique_id(self.params.video_path, 12)
            self.root_process_folder = self.root_process_folder / unique_id
        if self.params.custom_id:
            self.root_process_folder = self.root_process_folder / self.params.custom_id
        return self.root_process_folder
    @time_this
    def run_all(self):
        if self.params.skip_to <= 1:
            end_time = self.step_extract_frames()[1]
            logger.info("Stage 1 (Extract Frames) took %s", end_time)
        if self.params.skip_to <= 2:
            end_time = self.step_classify_slides()[1]
            logger.info("Stage 2 (Classify Slides) took %s", end_time)
        if self.params.skip_to <= 3:
            end_time = self.step_black_border_removal()[1]
            logger.info("Stage 3 (Border Removal) took %s", end_time)
        if self.params.skip_to <= 4:
            end_time = self.step_perspective_crop()[1]
            logger.info("Stage 4 (Perspective Crop) took %s", end_time)
        if self.params.skip_to <= 5:
            end_time = self.step_cluster_slides()[1]
            logger.info("Stage 5 (Cluster Slides) took %s", end_time)
        if self.params.skip_to <= 6:
            end_time = self.step_slide_structure_analysis()[1]
            logger.info("Stage 6 (SSA and OCR Slides) took %s", end_time)
        if self.params.skip_to <= 7:
            end_time = self.step_extract_figures()[1]
            logger.info("Stage 7 (Extract Figures) took %s", end_time)
        if self.params.skip_to <= 8:
            end_time = self.step_transcribe_audio()[1]
            logger.info("Stage 8 (Transcribe Audio) took %s", end_time)
        if self.params.skip_to <= 9:
            end_time = self.step_summarize()[1]
            logger.info("Stage 9 (Summarization) took %s", end_time)

        if self.params.remove:
            rmtree(self.root_process_folder)
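    # Illustrative sketch: `run_all` is itself wrapped in `time_this`, so it
    # returns `(None, total_seconds)`. Setting `skip_to` resumes the pipeline
    # at a given stage, e.g. `skip_to=8` reruns only transcription and
    # summarization:
    #
    #     summarizer.params.skip_to = 8
    #     _, total_seconds = summarizer.run_all()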
    @time_this
    def step_extract_frames(self):
        output_path = getattr(
            self.params,
            "extract_frames_output_path",
            self.root_process_folder / "frames",
        )
        extract_frames(
            self.params.video_path,
            self.params.extract_frames_quality,
            output_path,
            self.params.extract_every_x_seconds,
        )
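    # For intuition only: extracting one frame every `extract_every_x_seconds`
    # is equivalent to an ffmpeg invocation like the one below (a sketch that
    # assumes `extract_frames` shells out to ffmpeg; here every 3 seconds,
    # with quality mapped to `-q:v`):
    #
    #     ffmpeg -i lecture.mp4 -vf fps=1/3 -q:v 2 frames/img_%05d.jpg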
    @time_this
    def step_classify_slides(self):
        frames_dir = getattr(
            self.params, "frames_dir", self.root_process_folder / "frames"
        )
        frames_sorted_dir, _, _ = classify_frames(
            frames_dir, model_path=self.params.slide_classifier_model_path
        )
        self.frames_sorted_dir = frames_sorted_dir
    @time_this
    def step_black_border_removal(self):
        self.frames_sorted_dir = getattr(
            self, "frames_sorted_dir", self.root_process_folder / "frames_sorted"
        )
        slides_dir = getattr(
            self.params, "slides_dir", self.frames_sorted_dir / "slide"
        )
        slides_noborder_dir = getattr(
            self.params,
            "slides_noborder_dir",
            self.frames_sorted_dir / "slides_noborder",
        )

        # Save the frame number of the first 'slide' frame
        first_frame_num_file_path = getattr(
            self.params,
            "first_frame_num_file_path",
            self.root_process_folder / "first-frame-num.txt",
        )
        first_slide_frame_filename = sorted(os.listdir(slides_dir))[0]
        self.first_slide_frame_num = frame_number_from_filename(
            first_slide_frame_filename
        )
        with open(first_frame_num_file_path, "a") as first_frame_num_file:
            first_frame_num_file.write(str(self.first_slide_frame_num))

        if os.path.exists(slides_dir):
            os.makedirs(slides_noborder_dir, exist_ok=True)
            if self.params.remove_duplicates:
                images_hashed = imghash.sort_by_duplicates(slides_dir)
                imghash.remove_duplicates(slides_dir, images_hashed)
            removed_borders_paths = border_removal.all_in_folder(slides_dir)
            copy_all(removed_borders_paths, slides_noborder_dir)

        self.slides_noborder_dir = slides_noborder_dir
    @time_this
    def step_perspective_crop(self):
        self.frames_sorted_dir = getattr(
            self, "frames_sorted_dir", self.root_process_folder / "frames_sorted"
        )
        self.slides_noborder_dir = getattr(
            self, "slides_noborder_dir", self.frames_sorted_dir / "slides_noborder"
        )
        presenter_slide_dir = getattr(
            self.params,
            "presenter_slide_dir",
            self.frames_sorted_dir / "presenter_slide",
        )
        imgs_to_cluster_dir = getattr(
            self.params,
            "imgs_to_cluster_dir",
            self.frames_sorted_dir / "imgs_to_cluster",
        )

        if os.path.exists(presenter_slide_dir):
            if self.params.remove_duplicates:
                logger.info(
                    "Stage 4 (Duplicate Removal & Perspective Crop): Remove 'presenter_slide' duplicates"
                )
                imghash_start_time = timer()
                images_hashed = imghash.sort_by_duplicates(presenter_slide_dir)
                imghash.remove_duplicates(presenter_slide_dir, images_hashed)
                imghash_end_time = timer() - imghash_start_time
                logger.info(
                    "Stage 4 (Duplicate Removal & Perspective Crop): Remove 'presenter_slide' duplicates took %s",
                    imghash_end_time,
                )

            logger.info(
                "Stage 4 (Duplicate Removal & Perspective Crop): SIFT Matching"
            )
            siftmatch_start_time = timer()
            (
                non_unique_presenter_slides,
                transformed_image_paths,
            ) = sift_matcher.match_features(
                self.slides_noborder_dir, presenter_slide_dir
            )
            siftmatch_end_time = timer() - siftmatch_start_time
            logger.info(
                "Stage 4 (Duplicate Removal & Perspective Crop): SIFT Matching took %s",
                siftmatch_end_time,
            )

            # Remove all 'presenter_slide' images that are duplicates of 'slide'
            # images and all 'slide' images that are better represented by a
            # 'presenter_slide' image.
            for x in non_unique_presenter_slides:
                try:
                    os.remove(x)
                except OSError:
                    pass

            # If there are transformed images, then the camera motion was steady and
            # we do not have to run `corner_crop_transform`. If camera motion was
            # detected, then the `transformed_image_paths` list will be empty and
            # `presenter_slide_dir` will contain potentially unique 'presenter_slide'
            # images that do not appear in any images of the 'slide' class.
            if transformed_image_paths:
                copy_all(transformed_image_paths, imgs_to_cluster_dir)
            else:
                logger.info(
                    "Stage 4 (Duplicate Removal & Perspective Crop): Corner Crop Transform"
                )
                cornercrop_start_time = timer()
                cropped_imgs_paths = corner_crop_transform.all_in_folder(
                    presenter_slide_dir, remove_original=False
                )
                copy_all(cropped_imgs_paths, imgs_to_cluster_dir)
                cornercrop_end_time = timer() - cornercrop_start_time
                logger.info(
                    "Stage 4 (Duplicate Removal & Perspective Crop): Corner Crop Transform took %s",
                    cornercrop_end_time,
                )
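    # Background sketch (not the project's `sift_matcher`, just an OpenCV
    # illustration of the SIFT feature-matching technique used above to decide
    # whether a 'presenter_slide' frame shows an already-captured slide):
    #
    #     import cv2
    #     sift = cv2.SIFT_create()
    #     kp1, des1 = sift.detectAndCompute(slide_img, None)
    #     kp2, des2 = sift.detectAndCompute(presenter_img, None)
    #     matches = cv2.BFMatcher().knnMatch(des1, des2, k=2)
    #     good = [m for m, n in matches if m.distance < 0.75 * n.distance]
    #     # Many good matches => both images show the same slide.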
    @time_this
    def step_cluster_slides(self):
        self.frames_sorted_dir = getattr(
            self, "frames_sorted_dir", self.root_process_folder / "frames_sorted"
        )
        self.imgs_to_cluster_dir = getattr(
            self, "imgs_to_cluster_dir", self.frames_sorted_dir / "imgs_to_cluster"
        )
        self.slides_noborder_dir = getattr(
            self, "slides_noborder_dir", self.frames_sorted_dir / "slides_noborder"
        )

        copy_all(self.slides_noborder_dir, self.imgs_to_cluster_dir)

        if self.params.remove_duplicates:
            images_hashed = imghash.sort_by_duplicates(self.imgs_to_cluster_dir)
            imghash.remove_duplicates(self.imgs_to_cluster_dir, images_hashed)

        if self.params.cluster_method == "normal":
            cluster_filesystem = ClusterFilesystem(
                self.imgs_to_cluster_dir,
                algorithm_name="affinity_propagation",
                preference=-8,
                damping=0.72,
                max_iter=1000,
                model_path=self.params.slide_classifier_model_path,
            )
            cluster_filesystem.extract_and_add_features()
            if self.params.tensorboard:
                cluster_filesystem.visualize(self.params.tensorboard)
            cluster_dir, best_samples_dir = cluster_filesystem.transfer_to_filesystem()
        elif self.params.cluster_method == "segment":
            segment_cluster = SegmentCluster(
                self.imgs_to_cluster_dir,
                model_path=self.params.slide_classifier_model_path,
            )
            segment_cluster.extract_and_add_features()
            cluster_dir, best_samples_dir = segment_cluster.transfer_to_filesystem()
        else:
            # Raise instead of printing so `cluster_dir` is never referenced
            # while unbound.
            raise ValueError(
                "Invalid `cluster_method` option: " + str(self.params.cluster_method)
            )

        self.cluster_dir = cluster_dir
        self.best_samples_dir = best_samples_dir
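    # Background sketch: the "normal" method above clusters slide feature
    # vectors with affinity propagation. In scikit-learn terms (an
    # illustration, not the project's `ClusterFilesystem`):
    #
    #     from sklearn.cluster import AffinityPropagation
    #     ap = AffinityPropagation(preference=-8, damping=0.72, max_iter=1000)
    #     labels = ap.fit_predict(slide_feature_vectors)
    #     # One cluster per distinct slide; `ap.cluster_centers_indices_`
    #     # identifies the most representative sample of each cluster.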
    @time_this
    def step_slide_structure_analysis(self):
        self.frames_sorted_dir = getattr(
            self, "frames_sorted_dir", self.root_process_folder / "frames_sorted"
        )
        self.cluster_dir = getattr(
            self, "cluster_dir", self.frames_sorted_dir / "slide_clusters"
        )
        self.best_samples_dir = getattr(
            self, "best_samples_dir", self.cluster_dir / "best_samples"
        )
        ocr_raw_output_file = getattr(
            self.params,
            "ocr_raw_output_file",
            self.root_process_folder / "slide-ocr.txt",
        )
        ocr_json_output_file = getattr(
            self.params,
            "ocr_json_output_file",
            self.root_process_folder / "slide-ssa.json",
        )

        ocr_raw_text, ocr_json_data = slide_structure_analysis.all_in_folder(
            self.best_samples_dir
        )

        if "ocr" in self.params.spell_check:
            ocr_raw_text = self.spell_checker.check_all(ocr_raw_text)

        slide_structure_analysis.write_to_file(
            ocr_raw_text, ocr_json_data, ocr_raw_output_file, ocr_json_output_file
        )

        self.ocr_raw_output_file = ocr_raw_output_file
        self.ocr_json_output_file = ocr_json_output_file
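    # Background sketch: the OCR underlying `slide_structure_analysis` can be
    # approximated with pytesseract (an illustration under that assumption,
    # not the project's code; the file name is hypothetical):
    #
    #     import pytesseract
    #     from PIL import Image
    #     text = pytesseract.image_to_string(Image.open("best_sample.jpg"))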
    @time_this
    def step_extract_figures(self):
        self.frames_sorted_dir = getattr(
            self, "frames_sorted_dir", self.root_process_folder / "frames_sorted"
        )
        self.cluster_dir = getattr(
            self, "cluster_dir", self.frames_sorted_dir / "slide_clusters"
        )
        self.best_samples_dir = getattr(
            self, "best_samples_dir", self.cluster_dir / "best_samples"
        )
        self.ocr_json_output_file = getattr(
            self, "ocr_json_output_file", self.root_process_folder / "slide-ssa.json"
        )
        figures_dir = getattr(
            self.params, "figures_dir", self.cluster_dir / "best_samples_figures"
        )
        os.makedirs(figures_dir, exist_ok=True)

        figure_paths = figure_detection.all_in_folder(
            self.best_samples_dir, east=self.params.east_path
        )
        copy_all(figure_paths, figures_dir, move=True)

        if os.path.isfile(self.ocr_json_output_file):
            with open(self.ocr_json_output_file, "r") as ssa_json_file:
                ssa = json.load(ssa_json_file)
            ssa = figure_detection.add_figures_to_ssa(ssa, figures_dir)
            with open(self.ocr_json_output_file, "w") as ssa_json_file:
                json.dump(ssa, ssa_json_file)

        self.figures_dir = figures_dir
    @time_this
    def step_transcribe_audio(self):
        extract_from_video = self.params.video_path
        audio_path = getattr(
            self.params, "audio_path", self.root_process_folder / "audio.wav"
        )
        transcript_output_file = getattr(
            self.params,
            "transcript_output_file",
            self.root_process_folder / "audio.txt",
        )
        transcript_json_output_file = getattr(
            self.params,
            "transcript_json_output_file",
            self.root_process_folder / "audio.json",
        )

        transcript_json = None
        yt_transcription_failed = False
        custom_transcription_failed = False

        if self.params.custom_transcript_check:
            identified_files = glob.glob(
                str(self.root_process_folder / self.params.custom_transcript_check)
                + ".*"
            )
            if identified_files:
                transcript_path = identified_files[0]
                transcript, transcript_json = transcribe.caption_file_to_string(
                    transcript_path
                )
                if transcript is None:
                    custom_transcription_failed = True
            else:
                custom_transcription_failed = True

        if (not self.params.custom_transcript_check) or (
            self.params.custom_transcript_check and custom_transcription_failed
        ):
            if self.params.transcription_method == "youtube":
                yt_output_file = self.root_process_folder / "audio.vtt"
                try:
                    transcript_path = transcribe.get_youtube_transcript(
                        self.params.video_id, yt_output_file
                    )
                    transcript, transcript_json = transcribe.caption_file_to_string(
                        transcript_path
                    )
                except Exception:
                    yt_transcription_failed = True
                    self.params.transcription_method = (
                        self.transcription_method_default
                    )
                    logger.error(
                        "Error detected in grabbing transcript from YouTube. Falling back to %s transcription.",
                        self.transcription_method_default,
                    )

            if (
                self.params.transcription_method != "youtube"
                or yt_transcription_failed
            ):
                transcribe.extract_audio(extract_from_video, audio_path)
                try:
                    if self.params.chunk == "silence":
                        chunk_dir = self.root_process_folder / "chunks"
                        transcribe.chunk_by_silence(audio_path, chunk_dir)
                        transcript, transcript_json = transcribe.process_chunks(
                            chunk_dir,
                            model_dir=self.params.transcribe_model_dir,
                            method=self.params.transcription_method,
                        )
                        if self.params.transcribe_segment_sentences:
                            transcript, transcript_json = transcribe.segment_sentences(
                                transcript, transcript_json
                            )
                    elif self.params.chunk == "speech":
                        stt_model = transcribe.load_model(
                            self.params.transcription_method,
                            model_dir=self.params.transcribe_model_dir,
                        )
                        # Only DeepSpeech has a `sampleRate()` method, but
                        # `stt_model` could contain a DeepSpeech or Vosk model.
                        try:
                            desired_sample_rate = stt_model.sampleRate()
                        except AttributeError:
                            # Default sample rate to convert to is 16000
                            desired_sample_rate = 16000

                        segments, _, audio_length = transcribe.chunk_by_speech(
                            audio_path, desired_sample_rate=desired_sample_rate
                        )
                        transcript, transcript_json = transcribe.process_segments(
                            segments,
                            stt_model,
                            method=self.params.transcription_method,
                            audio_length=audio_length,
                            do_segment_sentences=self.params.transcribe_segment_sentences,
                        )
                    else:  # if not chunking
                        transcript, transcript_json = transcribe.transcribe_audio(
                            audio_path,
                            method=self.params.transcription_method,
                            model=self.params.transcribe_model_dir,
                        )
                        if self.params.transcribe_segment_sentences:
                            transcript, transcript_json = transcribe.segment_sentences(
                                transcript, transcript_json
                            )
                except Exception:
                    logger.error(
                        "Audio transcription failed. Retry by running this script with the skip_to parameter set to 6."
                    )
                    raise

        if "transcript" in self.params.spell_check:
            transcript = self.spell_checker.check(transcript)

        transcribe.write_to_file(
            transcript,
            transcript_output_file,
            transcript_json,
            transcript_json_output_file,
        )

        self.transcript = transcript
        self.final_data["transcript"] = transcript
        self.transcript_output_file = transcript_output_file
        self.transcript_json_output_file = transcript_json_output_file
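    # Background sketch: silence-based chunking splits the audio wherever the
    # volume drops, so each chunk can be transcribed independently. With pydub
    # (an illustration, not the project's `chunk_by_silence`; threshold values
    # are assumptions):
    #
    #     from pydub import AudioSegment
    #     from pydub.silence import split_on_silence
    #     audio = AudioSegment.from_wav("audio.wav")
    #     chunks = split_on_silence(
    #         audio, min_silence_len=500, silence_thresh=-40
    #     )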
    @time_this
    def step_summarize(self):
        self.ocr_raw_output_file = getattr(
            self, "ocr_raw_output_file", self.root_process_folder / "slide-ocr.txt"
        )
        with open(self.ocr_raw_output_file, "r") as ocr_file:
            ocr_results_flat = ocr_file.read()
        self.ocr_json_output_file = getattr(
            self, "ocr_json_output_file", self.root_process_folder / "slide-ssa.json"
        )
        self.transcript_output_file = getattr(
            self, "transcript_output_file", self.root_process_folder / "audio.txt"
        )
        if not hasattr(self, "transcript"):
            with open(self.transcript_output_file, "r") as transcript_file:
                self.transcript = transcript_file.read()
        self.transcript_json_output_file = getattr(
            self,
            "transcript_json_output_file",
            self.root_process_folder / "audio.json",
        )
        self.extract_every_x_seconds = getattr(self, "extract_every_x_seconds", 1)

        lecture_summarized_output_file = self.root_process_folder / "summarized.txt"
        lecture_summarized_structured_output_file = (
            self.root_process_folder / "summarized.json"
        )

        # Remove line breaks
        ocr_results_flat = ocr_results_flat.replace("\n", " ").replace("\r", "")

        if (
            self.params.summarization_structured != "none"
            and self.params.summarization_structured is not None
        ):
            logger.info("Stage 9 (Summarization): Structured Summarization")
            ss_start_time = timer()

            if self.params.summarization_structured == "structured_joined":
                # Get the frame number of the first 'slide'
                if hasattr(self, "first_slide_frame_num"):
                    first_slide_frame_num = self.first_slide_frame_num
                else:
                    first_frame_num_file_path = getattr(
                        self.params,
                        "first_frame_num_file_path",
                        self.root_process_folder / "first-frame-num.txt",
                    )
                    with open(first_frame_num_file_path, "r") as first_frame_num_file:
                        first_slide_frame_num = first_frame_num_file.read()

                structured_summary = structured_joined_sum(
                    self.ocr_json_output_file,
                    self.transcript_json_output_file,
                    frame_every_x=self.extract_every_x_seconds,
                    ending_char=".",
                    first_slide_frame_num=int(first_slide_frame_num),
                    to_json=lecture_summarized_structured_output_file,
                    summarization_method=self.params.structured_joined_summarization_method,
                    abs_summarizer=self.params.structured_joined_abs_summarizer,
                    ext_summarizer=self.params.structured_joined_ext_summarizer,
                    hf_inference_api=self.params.abs_hf_api,
                )
                self.final_data["structured_summary"] = structured_summary

            ss_end_time = timer() - ss_start_time
            logger.info("Stage 9 (Summarization): Structured took %s", ss_end_time)
        else:
            logger.info("Skipping structured summarization.")

        # Combination Algorithm
        logger.info("Stage 9 (Summarization): Combination Algorithm")
        if self.params.combination_algo == "only_asr":
            summarized_combined = self.transcript
        elif self.params.combination_algo == "only_slides":
            summarized_combined = ocr_results_flat
        elif self.params.combination_algo == "concat":
            summarized_combined = ocr_results_flat + self.transcript
        elif self.params.combination_algo == "full_sents":
            summarized_combined = full_sents(ocr_results_flat, self.transcript)
        elif self.params.combination_algo == "keyword_based":
            summarized_combined = keyword_based_ext(ocr_results_flat, self.transcript)
        else:
            # No combination algorithm was specified, which should never happen
            # since argparse validates the choices.
            logger.warning("No combination algorithm selected. Defaulting to `concat`.")
            summarized_combined = ocr_results_flat + self.transcript

        # Modifications
        logger.info("Stage 9 (Summarization): Modifications")
        # Default to the unmodified combined text so `summarized_mod` is always
        # bound even when no modification applies.
        summarized_mod = summarized_combined
        if (
            self.params.summarization_mods != "none"
            and self.params.summarization_mods is not None
        ):
            if "full_sents" in self.params.summarization_mods:
                summarized_mod = get_complete_sentences(
                    summarized_combined, return_string=True
                )
        else:
            logger.debug("Skipping summarization_mods")

        # Extractive Summarization
        logger.info("Stage 9 (Summarization): Extractive")
        ext_start_time = timer()
        if (
            self.params.summarization_ext != "none"
            and self.params.summarization_ext is not None
        ):  # if an extractive method was specified
            if self.params.summarization_ext == "cluster":
                summarized_ext = cluster(
                    summarized_mod,
                    title_generation=False,
                    cluster_summarizer="abstractive",
                    hf_inference_api=self.params.abs_hf_api,
                )
            else:  # one of the generic options was specified
                summarized_ext = generic_extractive_sumy(
                    summarized_mod, algorithm=self.params.summarization_ext
                )
        else:
            # Fall back to the modified text so the abstractive stage still
            # has input if extraction is skipped.
            summarized_ext = summarized_mod
            logger.debug("Skipping summarization_ext")
        ext_end_time = timer() - ext_start_time
        logger.info("Stage 9 (Summarization): Extractive took %s", ext_end_time)

        # Abstractive Summarization
        logger.info("Stage 9 (Summarization): Abstractive")
        abs_start_time = timer()
        if (
            self.params.summarization_abs != "none"
            and self.params.summarization_abs is not None
        ):  # if an abstractive method was specified
            lecture_summarized = generic_abstractive(
                summarized_ext,
                self.params.summarization_abs,
                hf_inference_api=self.params.abs_hf_api_overall,
            )
        else:  # if no abstractive summarization method was specified
            lecture_summarized = summarized_ext
            logger.debug("Skipping summarization_abs")
        abs_end_time = timer() - abs_start_time
        logger.info("Stage 9 (Summarization): Abstractive took %s", abs_end_time)

        transcribe.write_to_file(lecture_summarized, lecture_summarized_output_file)

        self.final_data["lecture_summary"] = lecture_summarized
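
# End-to-end usage sketch (the params file name and values are hypothetical;
# the full parameter list comes from the project's argparse setup, see the
# documentation):
#
#     if __name__ == "__main__":
#         summarizer = LectureSummarizer(
#             "lecture_params.json",
#             video_path="lecture.mp4",
#             skip_to=0,  # 0 or 1 runs every stage
#         )
#         _, total_seconds = summarizer.run_all()
#         print(summarizer.final_data["lecture_summary"])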