| # ... |
| # |
| # Copyright (c) 2019 Philippe Mathieu-Daudé <f4bug@amsat.org> |
| # |
| # This work is licensed under the terms of the GNU GPL, version 2 or |
| # later. See the COPYING file in the top-level directory. |
| |
| import re |
| import logging |
| |
| from avocado.utils import process |
| from avocado.utils.path import find_command, CmdNotFoundError |
| |
def tesseract_available(expected_version):
    """Tell whether a recent enough ``tesseract`` binary is installed.

    The binary is located with ``find_command`` and then asked for its
    version with ``tesseract --version``. The major version number is
    extracted from the ``tesseract <version>`` banner and compared with
    *expected_version*.

    :param expected_version: minimum acceptable major version (an int)
    :returns: True if tesseract is found and its major version is greater
              than or equal to *expected_version*; False if the binary is
              missing or no version banner can be recognised in its output
    :raises: whatever ``process.run`` raises when running the command
             fails; that call is deliberately left unguarded
    """
    try:
        find_command('tesseract')
    except CmdNotFoundError:
        return False
    res = process.run('tesseract --version')
    # Look at stdout first and fall back to stderr: the banner is not
    # always printed on the same stream (presumably this depends on the
    # tesseract release).
    for output in (res.stdout_text, res.stderr_text):
        # Accept an optional 'v' prefix ("tesseract v5.0.0") and
        # multi-digit major numbers.
        match = re.search(r'tesseract\s+v?(\d+)', output)
        if match is not None:
            # the captured group is guaranteed to consist of digits only
            return int(match.group(1)) >= expected_version
    return False
| |
| |
def tesseract_ocr(image_path, tesseract_args='', tesseract_version=3):
    """Run tesseract on an image and return the text lines it prints.

    The command ``tesseract <tesseract_args> <image_path> stdout`` is
    executed through ``process.run``. Its standard output is split on
    newlines, each line is stripped of surrounding whitespace, and empty
    lines are dropped. The image path and every kept line are logged at
    debug level on the 'tesseract' logger.

    :param image_path: path of the image handed to tesseract
    :param tesseract_args: extra command line arguments, as a string
    :param tesseract_version: when exactly 4, ' --oem 1' is appended to
                              the arguments
    :returns: list of the non-empty, stripped output lines, in order
    """
    log = logging.getLogger('tesseract')
    log.debug(image_path)
    if tesseract_version == 4:
        tesseract_args += ' --oem 1'
    command = "tesseract {} {} stdout".format(tesseract_args, image_path)
    result = process.run(command)
    recognized = []
    for raw_line in result.stdout_text.split('\n'):
        text = raw_line.strip()
        if not text:
            # skip blank lines
            continue
        log.debug(text)
        recognized.append(text)
    return recognized