Commit 1df480ed authored by Christian Reul

added NFC normalization in combinegt

parent b6b6c9a1
Pipeline #21868 passed with stage
in 6 minutes and 14 seconds
......@@ -5,6 +5,7 @@ from ocrtypoworkflow.ocrtypoalign.datastructs import Result, DisplayChar, Charac
from ocrtypoworkflow.util.io import glob_all
from ocrtypoworkflow.util.multiprocessing import parallel_map
import json
import unicodedata
class SingleProcessArgs(NamedTuple):
ocr: str
......@@ -19,7 +20,7 @@ def run_single(args: SingleProcessArgs):
try:
data = {
'ocr': open(ocr).read(),
'ocr': unicodedata.normalize('NFC', open(ocr).read()),
'typo1': open(typo1).read(),
'typo2': open(typo2).read(),
}
......@@ -37,6 +38,7 @@ def run_single(args: SingleProcessArgs):
lens = list(map(len, data.values()))
if any(lens[0] != l for l in lens):
print("Invalid lengths {} in files".format(lens, ocr))
print(ocr)
comb.isCorrected = True
comb.corrected = [
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment