Source code for lecture2notes.end_to_end.ocr

try:
    from PIL import Image
except ImportError:
    import Image

import logging
import os

import pytesseract
from tqdm import tqdm

logger = logging.getLogger(__name__)


def all_in_folder(path):
    """Perform OCR using ``pytesseract`` on every file in a folder.

    Directory entries are processed in sorted name order so the returned list
    is deterministic. Entries that are not regular files (for example
    sub-directories) are skipped.

    Args:
        path: Directory containing the images to run OCR on.

    Returns:
        list: One ``pytesseract.image_to_string`` result per regular file in
        ``path``, in sorted filename order.
    """
    results = []
    images = sorted(os.listdir(path))

    for item in tqdm(images, total=len(images), desc="> OCR: Progress"):
        current_path = os.path.join(path, item)
        if not os.path.isfile(current_path):
            continue

        # Open the image in a context manager so the underlying file handle is
        # released as soon as OCR finishes, rather than leaking one descriptor
        # per image on large folders.
        with Image.open(current_path) as image:
            results.append(pytesseract.image_to_string(image))

    logger.debug("Returning results")
    return results


def write_to_file(results, save_file):
    """Append every entry of ``results`` to the file at ``save_file``.

    Used to write results from ``all_in_folder()`` to ``save_file``. The file
    is opened in ``"a+"`` mode, so existing content is kept and new entries
    are added at the end. Each entry is followed by ``"\\r\\n"``.

    Args:
        results: Iterable of strings with a known length (``len(results)`` is
            used for the progress bar).
        save_file: Path of the file to append to; it is created if missing.
    """
    # The context manager guarantees the handle is closed even when a write
    # fails part-way through (e.g. a non-string item or an encoding error).
    with open(save_file, "a+") as file_results:
        logger.info("Writing results to file %s", save_file)
        for item in tqdm(
            results, total=len(results), desc="> OCR: Writing To File Progress"
        ):
            file_results.write(item + "\r\n")

    logger.debug("Results written to %s", save_file)