Source code for lecture2notes.end_to_end.cluster

import logging
import os
import shutil

from PIL import Image
from tqdm import tqdm

from ..models.slide_classifier import inference
from ..models.slide_classifier.class_cluster_scikit import (  # pylint: disable=import-error,wrong-import-position
    Cluster,
)
from ..models.slide_classifier.custom_nnmodules import *  # noqa: F403,F401
from .helpers import make_dir_if_not_exist

logger = logging.getLogger(__name__)


[docs]class ClusterFilesystem(Cluster): """Clusters images from a directory and saves them to disk in folders corresponding to each centroid.""" def __init__( self, slides_dir, algorithm_name="kmeans", num_centroids=20, preference=None, damping=0.5, max_iter=200, model_path="model_best.ckpt", ): self.slides_dir = slides_dir self.model = inference.load_model(model_path) super().__init__( algorithm_name=algorithm_name, preference=preference, damping=damping, max_iter=max_iter, )
[docs] def extract_and_add_features(self, copy=True): """Extracts features from the images in `slides_dir` and saves feature vectors with super().add()""" slides = os.listdir(self.slides_dir) num_slides = len(slides) logger.info("Extracting features from " + str(num_slides) + " slides") for idx, slide in tqdm( enumerate(slides), total=num_slides, desc="> AI Clustering Engine: Feature extraction", ): current_slide_path = os.path.join(self.slides_dir, slide) _, _, _, extracted_features = inference.get_prediction( self.model, Image.open(current_slide_path) ) super().add(extracted_features, slide) super().create_algorithm_if_none()
[docs] def transfer_to_filesystem(self, copy=True, create_best_samples_folder=True): """Uses `move_list` from super() to take all images in directory `slides_dir` and save each cluster to a subfolder in `cluster_dir` (directory in parent of `slides_dir`)""" cluster_dir = ( self.slides_dir.parents[0] / "slide_clusters" ) # cluster_dir = up one directory from slides_dir then into "slide_clusters" move_list = super().get_move_list() best_samples_path = None if create_best_samples_folder: closest_filenames = super().get_closest_sample_filenames_to_centroids() best_samples_path = cluster_dir / "best_samples" make_dir_if_not_exist(best_samples_path) for filename in closest_filenames: slide_path = self.slides_dir / filename shutil.copy(str(slide_path), str(best_samples_path)) for filename in tqdm( move_list, desc="> AI Clustering Engine: Move/copy into cluster folders" ): cluster_number = move_list[filename] current_slide_path = os.path.join(self.slides_dir, filename) current_cluster_path = cluster_dir / str(cluster_number) make_dir_if_not_exist(current_cluster_path) if copy: shutil.copy(str(current_slide_path), str(current_cluster_path)) else: shutil.move(str(current_slide_path), str(current_cluster_path)) return cluster_dir, best_samples_path
# def extract_features(slides_dir, copy=True): # """Clusters all images in directory `slides_dir` and saves each cluster to a subfolder in `cluster_dir` (directory in parent of `slides_dir`)""" # slides = os.listdir(slides_dir) # num_slides = len(slides) # cluster = Cluster(algorithm_name="affinity_propagation", preference=-8, damping=0.72) # print("> AI Clustering Engine: Extracting features from " + str(num_slides) + " slides") # for idx, slide in tqdm(enumerate(slides), total=num_slides, desc="> AI Clustering Engine: Feature extraction"): # current_slide_path = os.path.join(slides_dir, slide) # _, _, _, extracted_features = get_prediction(model, Image.open(current_slide_path)) # cluster.add(extracted_features, slide) # #cluster.calculate_best_k() # move_list = cluster.create_move_list() # num_clusters = cluster.get_num_clusters() # print("> AI Clustering Engine: Predicted Number of Clusters: " + str(num_clusters)) # return move_list # def transfer_to_filesystem(slides_dir, move_list, create_best_samples_folder=True): # cluster_dir = slides_dir.parents[0] / "slide_clusters" # cluster_dir = up one directory from slides_dir then into "slide_clusters" # if create_best_samples_folder: # for filename in tqdm(move_list, desc="> AI Clustering Engine: Move/copy into cluster folders"): # cluster_number = move_list[filename] # current_slide_path = os.path.join(slides_dir, filename) # current_cluster_path = cluster_dir / str(cluster_number) # make_dir_if_not_exist(current_cluster_path) # if copy: # shutil.copy(str(current_slide_path), str(current_cluster_path)) # else: # shutil.move(str(current_slide_path), str(current_cluster_path)) # return cluster_dir