From 1df480edbef7b844c836196a24fe4142b6412508 Mon Sep 17 00:00:00 2001 From: Christian Reul Date: Tue, 31 Mar 2020 11:34:15 +0200 Subject: [PATCH] added NFC normalization in combinegt --- ocrtypoworkflow/combinegt.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ocrtypoworkflow/combinegt.py b/ocrtypoworkflow/combinegt.py index f6b39d4..1b27b68 100644 --- a/ocrtypoworkflow/combinegt.py +++ b/ocrtypoworkflow/combinegt.py @@ -5,6 +5,7 @@ from ocrtypoworkflow.ocrtypoalign.datastructs import Result, DisplayChar, Charac from ocrtypoworkflow.util.io import glob_all from ocrtypoworkflow.util.multiprocessing import parallel_map import json +import unicodedata class SingleProcessArgs(NamedTuple): ocr: str @@ -19,7 +20,7 @@ def run_single(args: SingleProcessArgs): try: data = { - 'ocr': open(ocr).read(), + 'ocr': unicodedata.normalize('NFC', open(ocr).read()), 'typo1': open(typo1).read(), 'typo2': open(typo2).read(), } @@ -37,6 +38,7 @@ def run_single(args: SingleProcessArgs): lens = list(map(len, data.values())) if any(lens[0] != l for l in lens): print("Invalid lengths {} in files".format(lens, ocr)) + print(ocr) comb.isCorrected = True comb.corrected = [ -- GitLab