Commit 92db6f51 authored by tmb's avatar tmb
Browse files

merged

parents efb3b937 23c6eb47
syntax: glob
*.cmodel
MODELS/*.cmodel
.*
*~
......
This diff is collapsed.
......@@ -5,7 +5,7 @@
# -- page frame removal, noise removal, ...
# -- parallelize TIFF images
import sys,os,re,optparse,shutil
import sys,os,re,optparse,shutil,traceback
import matplotlib
if "DISPLAY" not in os.environ: matplotlib.use("AGG")
else: matplotlib.use("GTK")
......@@ -135,7 +135,11 @@ def process1(t):
for image,arg in ocrolib.page_iterator([arg]):
assert n<2,"no multipage files with parallel processing; use -P 0"
print "===",arg,count,image.shape
process_image(image,arg,count)
try:
process_image(image,arg,count)
except:
traceback.print_exc()
raise
n += 1
if options.parallel<2:
......
......@@ -628,7 +628,7 @@ def build_toolbar():
def main():
global main_widget_tree,class_selector,cluster_viewer,info_area
gladefile = ocrolib.findfile("gui/ocropus-cedit.glade")
gladefile = ocrolib.findfile("gui/ocroold-cedit.glade")
windowname = "window1"
main_widget_tree = gtk.glade.XML(gladefile)
dic = {
......
#!/usr/bin/python
import code,pickle,sys,os,re
import matplotlib
if "DISPLAY" not in os.environ: matplotlib.use("AGG")
else: matplotlib.use("GTK")
from pylab import *
from optparse import OptionParser
import ocrolib
from ocrolib import dbtables,segrec,distcomp,docproc
# --- command-line interface for the fast epsilon-clustering tool ---
# NOTE(review): -D/--display and -v/--verbose appear unused in this script.
parser = OptionParser("""
usage: %prog [options] chars.db cluster.db
Perform fast clustering of characters in a database using a fixed distance
measure. The resulting cluster databases are often small enough to be
labeled directly, or they can be clustered further using k-means.
This also updates the cluster id of the char in the original char.db to the id
of the corresponding cluster.
""")
parser.add_option("-D","--display",help="display chars",action="store_true")
parser.add_option("-v","--verbose",help="verbose output",action="store_true")
parser.add_option("-t","--table",help="table name",default="chars")
parser.add_option("-e","--epsilon",help="epsilon",type=float,default=0.1)
parser.add_option("-o","--overwrite",help="overwrite output if it exists",action="store_true")
class ScaledFE:
    """A feature extractor that only rescales the input image to fit into
    a 32x32 (or, generally, r x r box) and normalizes the vector.
    Parameters are r (size of the rescaled image), and normalize (can be
    one of "euclidean", "max", "sum", or None)."""
    def __init__(self,**kw):
        # defaults; ocrolib.set_params may override them from kw
        self.r = 32
        self.normalize = "euclidean"
        ocrolib.set_params(self,kw)
    def extract(self,image):
        """Rescale image into an r x r box and apply the configured
        normalization; returns a float array."""
        vec = array(docproc.isotropic_rescale(image,self.r),'f')
        # treat a missing attribute the same as normalize=None
        mode = getattr(self,"normalize",None)
        if mode=="euclidean":
            vec = vec/sqrt(sum(vec**2))
        elif mode=="max":
            vec = vec/amax(vec)
        elif mode=="sum":
            vec = vec/sum(abs(vec))
        return vec
class DistComp:
    """A growing collection of prototype vectors supporting Euclidean
    nearest-neighbor queries.  Prototypes are stored as the rows of the
    2-D array self.data; self.count holds a per-row merge count."""
    def __init__(self):
        self.data = None        # 2-D float array of prototypes, or None while empty
        self.count = []         # per-prototype merge counts (kept as floats)
    def add(self,v):
        """Append the flattened vector v as a new prototype row."""
        v = v.ravel().reshape(1,-1)
        if self.data is None:
            # copy so later external mutation of v cannot alias into the store
            # (the original kept a view of the caller's array here)
            self.data = v.copy()
        else:
            self.data = concatenate((self.data,v),axis=0)
        self.count.append(1.0)
    def distances(self,v):
        """Euclidean distance from v to every stored prototype.
        Vectorized; the original computed one norm per row in a Python loop."""
        if self.data is None: return array([],'f')
        delta = self.data - v.ravel()
        return sqrt((delta**2).sum(axis=1))
    def find(self,v,eps):
        """Index of the nearest prototype within eps of v, or -1 if none."""
        if self.data is None: return -1
        ds = self.distances(v)
        i = argmin(ds)
        if ds[i]>eps: return -1
        return i
    def merge(self,i,v,weight):
        """Add weight*v into prototype i (running-mean style update)."""
        self.data[i,:] += v.ravel()*weight
    def length(self):
        """Number of stored prototypes."""
        return self.data.shape[0]
    def counts(self,i):
        """Merge count recorded for prototype i."""
        return self.count[i]
    def vector(self,i):
        """Prototype i as a 1-D array."""
        return self.data[i,:]
    def nearest(self,v):
        """Index of the prototype closest to v (no eps threshold)."""
        ds = self.distances(v)
        return argmin(ds)
def test_DistComp():
    """Smoke test for DistComp: after adding 33 random vectors, looking up
    a stored vector with a tight epsilon must return its own index.
    (The original merely printed the result instead of asserting, and used
    a Python-2-only print statement.)"""
    dc = DistComp()
    for i in range(33):
        dc.add(randn(17))
    # dc.data[3] is at distance 0 from itself, so find must return 3
    assert dc.find(dc.data[3],0.5) == 3
class FastCluster:
    """Greedy epsilon-clustering of character images.  Each incoming image
    is feature-extracted with ScaledFE and either merged into the nearest
    existing cluster (if within eps) or started as a new cluster."""
    def __init__(self,eps=0.05):
        self.eps = eps          # distance threshold for joining an existing cluster
        self.ex = ScaledFE()    # feature extractor
        self.dc = DistComp()    # cluster centers / nearest-neighbor store
        self.classes = []       # per-cluster {class_label: count} histograms
        self.counts = []        # per-cluster sample counts
        self.total = 0          # total number of samples seen
    def add(self,c,cls=None):
        """Add image c with class label cls; return the cluster index it
        was assigned to."""
        self.total += 1
        # normalize out of place -- the original divided the caller's
        # array in place, silently clobbering it
        c = c/sqrt(sum(c**2))
        v = self.ex.extract(c)
        i = self.dc.find(v,self.eps)
        if i<0:
            # nothing within eps: start a new cluster
            self.dc.add(v)
            self.classes.append({cls:1})
            self.counts.append(1)
            return len(self.counts)-1
        else:
            # merge into cluster i and update its running mean
            self.classes[i][cls] = self.classes[i].get(cls,0)+1
            self.counts[i] += 1
            self.dc.merge(i,v,1.0/self.counts[i])
            return i
    def biniter(self):
        """Yield (index, center_vector, count, key) for every cluster."""
        for i in range(self.dc.length()):
            yield i,self.dc.vector(i),self.dc.counts(i),""
    def cls(self,i):
        """Return (most_frequent_class, its_count) for cluster i."""
        classes = list(self.classes[i].items())
        classes.sort(reverse=1,key=lambda x:x[1])
        return classes[0]
    def stats(self):
        """Short progress string: total samples and number of clusters."""
        return " ".join([str(self.total),str(self.dc.length())])
    def save(self,file):
        """Write the cluster centers to a ClusterTable database file."""
        table = dbtables.ClusterTable(file)
        table.create(image="blob",cls="text",count="integer",classes="text",cluster="integer")
        table.converter("image",dbtables.SmallImage())
        for i,v,count,key in self.biniter():
            image = array(v/amax(v)*255.0,'B')
            # centers are flattened r x r patches; recover r.  Uses numpy's
            # sqrt -- the original called math.sqrt, but "math" is never
            # imported in this script
            r = int(sqrt(image.size))
            assert r*r==image.size
            image.shape = (r,r)
            cls,count = self.cls(i)
            classes = repr(self.classes[i])
            table.set(image=image,cls=cls,count=count,classes=classes,cluster=i)
# --- script entry: parse arguments, open the database, cluster everything ---
(options,args) = parser.parse_args()
if len(args)!=2:
    parser.print_help()
    sys.exit(0)
input = args[0]
output = args[1]
# refuse to clobber an existing output database unless -o was given
if os.path.exists(output):
    if not options.overwrite:
        sys.stderr.write("%s: already exists\n"%output)
        sys.exit(1)
    else:
        os.unlink(output)
ion()
show()
# open the relevant tables
table = dbtables.Table(input,options.table)
table.converter("image",dbtables.SmallImage())
table.create(image="blob",cluster="integer",cls="integer")
binned = FastCluster(options.epsilon)
total = 0
for row in table.get():
    # get the image and the class out of the record
    raw = row.image
    cls = row.cls
    # don't store images that are too large
    if raw.shape[0]>255 or raw.shape[1]>255: continue
    # make sure the maximum is 1.0
    raw = raw/float(amax(raw))
    # add it to the binned clusterer
    cluster = binned.add(raw,cls)
    # measure and report progress
    total+=1
    if total%1000==0:
        print "#",total,"chars",binned.stats()
    # record which cluster the character was assigned to
    # NOTE(review): the table name 'chars' is hard-coded here even though
    # -t/--table allows selecting a different input table -- verify
    table.execute("update chars set cluster=? where id=?",[cluster,row.id])
table.commit()
table.close()
# FIXME optionally perform k-means clustering here so that we can do
# everything in one step and keep the cluster labels updated more easily
# save the clustered data
print "#",binned.stats()
binned.save(output)
#!/usr/bin/python
import code,pickle,sys,os,re
import matplotlib
if "DISPLAY" not in os.environ: matplotlib.use("AGG")
else: matplotlib.use("GTK")
from pylab import *
from optparse import OptionParser
from scipy import stats
import ocrolib
from ocrolib import dbtables,quant,ocroold
# --- command-line interface for the k-means clustering tool ---
parser = OptionParser("""
usage: %prog [options] chars.db clusters.db
Perform kmeans clustering of characters in a database. This is fairly slow, loads
all characters into memory, and can't be applied to big databases. It is usually
applied after epsilon clustering if a further reduction in size is desired.
""")
parser.add_option("-D","--display",help="display chars",action="store_true")
parser.add_option("-v","--verbose",help="verbose output",action="store_true")
parser.add_option("-t","--table",help="table name",default="chars")
parser.add_option("-k","--k",help="k",type=int,default=300)
# NOTE(review): -m/--minvecs and -O/--outlier are accepted but currently
# unused -- the corresponding kmeans keyword arguments are commented out below
parser.add_option("-m","--minvecs",help="minimum number of vectors in a cluster",type=int,default=3)
# parser.add_option("-M","--minchange",help="minimum number of changes (fraction)",type=float,default=0.005)
parser.add_option("-n","--niter",help="max number of iterations",type=int,default=100)
parser.add_option("-O","--outlier",help="outlier range",type=float,default=3.0)
from scipy import mgrid,linalg,ndimage
import sys,os,random,math
import numpy,pylab,scipy
from numpy import *
verbose = 1
(options,args) = parser.parse_args()
if len(args)!=2:
parser.print_help()
sys.exit(0)
input = args[0]
output = args[1]
ion()
show()
table = dbtables.Table(input,options.table)
table.converter("image",dbtables.SmallImage())
table.create(image="blob",cls="text",classes="text")
classes = [row[0] for row in table.query("select distinct(cls) from '%s' order by cls"%options.table)]
extractor = ocroold.ScaledFE()
data = []
print "loading"
for row in table.get():
raw = row.image
if raw.shape[0]>255 or raw.shape[1]>255: continue
c = raw/float(amax(raw))
v = extractor.extract(c)
data.append(v)
print "clustering"
data = array(data,'f')
print "data",data.shape
# minchange=max(1,int(options.minchange*len(data)))
means,counts = quant.kmeans(data,k=options.k,maxiter=options.niter,
# outlier=options.outlier,minvecs=options.minvecs
)
print "writing"
table = dbtables.ClusterTable(output)
table.create(image="blob",cls="text",count="integer",classes="text")
table.converter("image",dbtables.SmallImage())
for i in range(means.shape[0]):
v = means[i]
image = array(v/amax(v)*255.0,'B')
image.shape = (30,30)
table.set(image=image,cls="_",count=counts[i],classes="")
#!/usr/bin/python
import numpy,pylab,random,sqlite3,collections,os,re
from pylab import *
from scipy import linalg
from scipy.ndimage.morphology import binary_erosion
from scipy.ndimage import interpolation,filters
from ocrolib import dbhelper,improc
import cv,random,pyflann
from collections import Counter,defaultdict
from optparse import OptionParser
import shelve
import tables
from tables import *
from ocrolib import docproc
class record:
    """Lightweight attribute bag: record(a=1).a == 1."""
    def __init__(self,**kw):
        # install every keyword argument as an instance attribute
        for key,value in kw.items():
            setattr(self,key,value)
    def __str__(self):
        # render as the underlying attribute dictionary
        return str(self.__dict__)
def uencode(s):
    """Pack a short string (at most 4 characters) into a single integer,
    16 bits per character, with the first character in the lowest bits."""
    assert len(s)<=4
    value = 0
    for ch in reversed(s):
        value = (value<<16)|ord(ch)
    return value
def udecode(i):
    """Inverse of uencode: unpack an integer into a string, 16 bits per
    character, lowest bits first."""
    # unichr exists only on Python 2; fall back to chr on Python 3
    # (str is bytes exactly on Python 2, and the untaken branch of a
    # conditional expression is never evaluated)
    _chr = unichr if str is bytes else chr
    result = []
    while i!=0:
        result.append(_chr(i&0xffff))
        i >>= 16
    return "".join(result)
def get_images(cname,table='chars'):
    """Yield one record per row of the given character table in the SQLite
    database cname.  Each image is padded by one pixel; the 'rel' geometry
    column is parsed into a float array when present and parseable,
    otherwise rel is None."""
    with sqlite3.connect(cname) as db:
        db.row_factory = dbhelper.DbRow
        query = "select * from %s"%table
        rows = db.execute(query)
        for row in rows:
            image = array(improc.pad_by(dbhelper.blob2image(row.image),1),'B')
            try:
                rel = array([float(x) for x in row.rel.split()])
            except Exception:
                # deliberate best-effort: a missing or unparseable geometry
                # column yields rel=None.  (Was a bare except, which also
                # swallowed KeyboardInterrupt/SystemExit.)
                rel = None
            yield record(id=row.id,image=image,cls=row.cls,rel=rel,cost=row.cost,
                         segid=row.segid,count=row.count,cluster=row.cluster,bbox=row.bbox,
                         classes=row.classes,file=row.file)
        del rows
def csnormalize(image,f=0.75):
    """Center a character image on its intensity centroid and rescale it
    based on its second moments; returns the transformed image (same
    shape).  Near-empty images are returned unchanged."""
    # binarize at the midpoint between the min and max intensities
    bimage = 1*(image>mean([amax(image),amin(image)]))
    w,h = bimage.shape
    [xs,ys] = mgrid[0:w,0:h]
    s = sum(bimage)
    # nothing to normalize on an (almost) empty image
    if s<1e-4: return image
    s = 1.0/s
    # centroid of the binarized image
    cx = sum(xs*bimage)*s
    cy = sum(ys*bimage)*s
    # second central moments (covariance of the foreground pixels)
    sxx = sum((xs-cx)**2*bimage)*s
    sxy = sum((xs-cx)*(ys-cy)*bimage)*s
    syy = sum((ys-cy)**2*bimage)*s
    # principal axis length from the largest covariance eigenvalue
    w,v = eigh(array([[sxx,sxy],[sxy,syy]]))
    l = sqrt(amax(w))
    # zoom factor chosen so the character spans roughly f of the frame
    # (the 4.0 presumably treats l as a quarter-extent -- TODO confirm)
    scale = f*max(image.shape)/(4.0*l)
    m = array([[1.0/scale,0],[0.0,1.0/scale]])
    w,h = image.shape
    c = array([cx,cy])
    # offset maps the output image center back onto the centroid
    d = c-dot(m,array([w/2,h/2]))
    image = interpolation.affine_transform(image,m,offset=d,order=1)
    return image
def table_log(db,*args):
    """Record a timestamped log entry as an attribute on the HDF5 root node."""
    import time
    message = " ".join(args)
    attr = "LOG_%d"%int(time.time())
    db.setNodeAttr("/",attr,message)
# --- command-line interface for the sqlite -> HDF5 converter ---
import argparse
parser = argparse.ArgumentParser( description = "Convert character databases in SQLite3 format to HDF5 format.")
parser.add_argument('db',default='training.db',help="db file")
parser.add_argument('-o','--output',default=None,help="hdf5 ouput file")
parser.add_argument('-n','--nimages',type=int,default=2000000000,help="max # images to convert")
parser.add_argument('-r','--pattern',default='.*',help="pattern for characters to transform")
parser.add_argument('-p','--size',type=int,default=32,help="patchsize; 0 stores pickled Python arrays instead")
parser.add_argument('-N','--nonormalize',action="store_true",help="do not perform size normalization")
parser.add_argument('-t','--table',default="chars",help="database table")
parser.add_argument('-g','--nogeometry',action='store_true',help='do not copy over geometry information')
parser.add_argument('-e','--extended',action='store_true',help='copy over extended information')
parser.add_argument('-D','--display',type=int,default=0,help='if non-zero, display characters')
args = parser.parse_args()
assert args.output is not None
assert args.size>5 and args.size<256
h5 = tables.openFile(args.output,"w")
size = args.size
if size==0:
images = h5.createVLArray(h5.root,'images',ObjectAtom(),filters=Filters(9))
else:
patches = h5.createEArray(h5.root,'patches',Float32Atom(),shape=(0,size,size),
title="characters as patches from "+args.db,
filters=Filters(9))
classes = h5.createEArray(h5.root,'classes',Int64Atom(),shape=(0,),filters=tables.Filters(9))
table_log(h5,"%s"%sys.argv)
if not args.nogeometry:
rel = h5.createEArray(h5.root,'rel',Float32Atom(shape=(3,)),shape=(0,),filters=tables.Filters(9))
if args.extended:
cclasses = h5.createVLArray(h5.root,'cclasses',StringAtom(120),filters=Filters(9))
files = h5.createVLArray(h5.root,'files',StringAtom(120),filters=Filters(9))
bboxes = h5.createEArray(h5.root,'bboxes',Float32Atom(shape=(4,)),shape=(0,),filters=Filters(9))
costs = h5.createEArray(h5.root,'costs',Float32Atom(),shape=(0,),filters=tables.Filters(9))
segids = h5.createEArray(h5.root,'segids',Int16Atom(),shape=(0,),filters=tables.Filters(9))
clusters = h5.createEArray(h5.root,'clusters',Int32Atom(),shape=(0,),filters=tables.Filters(9))
print 'selecting patterns with classes matching: /^'+args.pattern+'$/'
if args.nonormalize:
print "normalizing by isotropic rescaling into",size,"x",size,"bounding box"
else:
print "normalizing by moments into",size,"x",size,"bounding box"
try:
for r in get_images(args.db,table=args.table):
if len(classes)>=args.nimages: break
if not re.match(args.pattern+"$",r.cls): continue
if len(classes)%1000==0: sys.stdout.write("%d\r"%len(classes)); sys.stdout.flush()
if not args.nogeometry:
assert rel is not None,"geometry information missing from database; use -g"
assert len(rel)==len(classes)
rel.append(r.rel)
bbox = [int(x) for x in r.bbox.split()]
if args.extended:
cclasses.append([r.classes])
files.append([r.file])
bboxes.append([bbox])
costs.append([r.cost])
segids.append([int(r.segid or 0)])
clusters.append([int(r.cluster or 0)])
rshape = r.image.shape
if size==0:
images.append(r.image)
else:
image = array(r.image,'f')/255.0
if not args.nonormalize:
image = docproc.isotropic_rescale(image,size)
image = csnormalize(image)
else:
image = docproc.isotropic_rescale(image,size-2)
image = improc.pad_by(image,1)
patches.append([image])
classes.append([uencode(r.cls)])
if args.display>0 and len(classes)%args.display==0:
ion(); gray(); clf(); imshow(image); ginput(1,0.001)
print r.id,r.cls,rshape,r.rel,r.cost,r.count,r.segid,r.classes,bbox
finally:
h5.close()
print "done"
......@@ -24,7 +24,7 @@ import gnome
from matplotlib import patches
import scipy
import ocrolib
from ocrolib import fstutils,utils
from ocrolib import fstutils
default_model = "2m2-reject.cmodel"
default_segmenter = "DpSegmenter"
......@@ -111,7 +111,7 @@ class LineWindow:
self.lmodel = None
self.linerec = None
gladefile = ocrolib.findfile("ocropus-gtedit.glade")
gladefile = ocrolib.findfile("gui/ocroold-gtedit.glade")
self.windowname = "gtedit"
self.wtree = gtk.glade.XML(gladefile,self.windowname)
self.window = self.wtree.get_widget(self.windowname)
......
......@@ -181,6 +181,7 @@ def process_arg(arg):
if not options.silent:
if ocrolib.quick_check_line_components(line,dpi=options.dpi)<0.5:
continue
assert (regions.id(i)&0xff0000)>0
if options.gray:
ocrolib.write_image_gray("%s/%06x.bin.png"%(base,regions.id(i)),line)
line = regions.extract(page_gray,i,options.pad)
......
-- check line size normalization in segrec
uw3unlv-240-4-60-g2.cmodel
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment