Commit 6ce0c7ae authored by Alexander Gehrke's avatar Alexander Gehrke
Browse files

initial commit

parents
Python implementation of the PAGE XML format
============================================
This library implements the [PRImA PAGE XML][1] layout format. High-level
functions are provided for the parts relevant to basic segmentation; the rest
of the format is accessible via code generated by [generateDS][2].
[1]: https://www.primaresearch.org/tools/PAGELibraries
[2]: https://sourceforge.net/projects/generateds/
This diff is collapsed.
from pypagexml.core import *
# Creator name identifying this library.
# NOTE(review): presumably written into the PAGE metadata "Creator" field by
# code that is not visible here — confirm against the ds subclasses.
PCGTS_CREATOR = "pypagexml"
\ No newline at end of file
from typing import Optional
from PIL import Image
import pypagexml.ds as ds
import pypagexml.ds.generated as dsgen
def new_document(metadata: Optional[ds.MetadataType], page: Optional[ds.PageType]) -> ds.PcGtsTypeSub:
    """Build a new PcGts root element from the given metadata and page.

    :param metadata: metadata element to attach; when ``None``, the result of
        ``ds.MetadataTypeSub.default()`` is used instead.
    :param page: page element to attach; passed through unchanged (may be
        ``None``).
    :return: the newly constructed ``ds.PcGtsTypeSub``.
    """
    if metadata is None:
        metadata = ds.MetadataTypeSub.default()
    return ds.PcGtsTypeSub(metadata=metadata, page=page)
def new_document_from_image(path: str, metadata: Optional[ds.MetadataType]) -> ds.PcGtsTypeSub:
    """Build a new PcGts document whose page describes the image at *path*.

    The image is opened only to read its pixel size and, when present, its
    resolution; the file is closed again before the document is built.

    :param path: path of the image file; also stored as the page's
        ``imageFilename``.
    :param metadata: metadata element to attach; when ``None``, the result of
        ``ds.MetadataTypeSub.default()`` is used instead.
    :return: a ``ds.PcGtsTypeSub`` with a single ``ds.PageTypeSub`` carrying
        the image file name, width, height and resolution.
    """
    # Image.open() is lazy and keeps the underlying file open; the context
    # manager makes sure the handle is released once the header was read.
    with Image.open(path) as im:
        width, height = im.size
        # Not every image carries resolution metadata, so a missing 'dpi'
        # entry must not abort document creation.
        # NOTE(review): assumes the generated PageType treats None as
        # "attribute not set" — confirm against the generateDS output.
        xres, yres = im.info.get('dpi', (None, None))

    if metadata is None:
        metadata = ds.MetadataTypeSub.default()

    return ds.PcGtsTypeSub(
        metadata=metadata,
        page=ds.PageTypeSub(
            imageFilename=path,
            imageWidth=width,
            imageHeight=height,
            imageXResolution=xres,
            imageYResolution=yres,
        ),
    )
import re
# Matches ids that end in a run of digits preceded by at least one non-digit
# character; the trailing number is captured as group 1.
RE_POSTFIX_NUM = re.compile("^.*[^0-9]([0-9]+)$")


def maxid(regions):
    """Return the largest numeric id suffix found among *regions*.

    Each region must provide a ``get_id()`` method. Ids that are missing,
    empty, or do not match ``RE_POSTFIX_NUM`` (for example ids without a
    trailing number) are ignored.

    :param regions: iterable of region objects.
    :return: the highest trailing number as ``int``, or ``0`` when no region
        has a numbered id (including when *regions* is empty).
    """
    def idnum(region):
        # get_id is a method and has to be called; the regex needs the string.
        region_id = region.get_id()
        if not region_id:
            return None
        m = RE_POSTFIX_NUM.match(region_id)
        return int(m.group(1)) if m is not None else None

    # default=0 keeps documents without any numbered region from raising.
    return max((n for n in map(idnum, regions) if n is not None), default=0)
class PageXml:
    """Wrapper around a PcGts document that adds regions with fresh ids.

    On construction the highest numeric id suffix among the page's existing
    text regions and image regions is determined with ``maxid``; the two
    counters are then used to number newly added regions.
    """

    # Project types in signatures are written as string annotations so that
    # defining the class does not need to resolve them.
    def __init__(self, pcgts: "ds.PcGtsTypeSub"):
        """Wrap *pcgts* and initialise the id counters from its page."""
        self.pcgts = pcgts
        page = pcgts.get_Page()
        # Highest numeric suffix currently in use for each region kind.
        self.text_maxid = maxid(page.get_TextRegion())
        self.img_maxid = maxid(page.get_ImageRegion())

    def next_text_id(self):
        """Return a new text region id of the form ``rtxt<N>``."""
        # Advance first: the stored value is the highest number already in
        # use, so handing it out unchanged could duplicate an existing id.
        self.text_maxid += 1
        return f"rtxt{self.text_maxid}"

    def next_image_id(self):
        """Return a new image region id of the form ``rimg<N>``."""
        # Same reasoning as in next_text_id: increment before formatting.
        self.img_maxid += 1
        return f"rimg{self.img_maxid}"

    def add_paragraph(self, p, coords: "ds.Points", ptype="paragraph"):
        """Append a new text region to the page.

        :param p: not used by this method.
        :param coords: points handed to ``ds.CoordsTypeSub.with_points`` to
            build the region's ``Coords``.
        :param ptype: value passed as the region's ``type_``.
        """
        page: "ds.PageTypeSub" = self.pcgts.get_Page()
        region = ds.TextRegionTypeSub(
            id=self.next_text_id(),
            Coords=ds.CoordsTypeSub.with_points(coords),
            type_=ptype,
        )
        page.TextRegion.append(region)

    def add_image(self, p, coords: "ds.Points"):
        """Append a new image region to the page.

        :param p: not used by this method.
        :param coords: points handed to ``ds.CoordsTypeSub.with_points`` to
            build the region's ``Coords``.
        """
        page: "ds.PageTypeSub" = self.pcgts.get_Page()
        region = ds.ImageRegionTypeSub(
            # Image regions draw from the image counter, not the text one.
            id=self.next_image_id(),
            Coords=ds.CoordsTypeSub.with_points(coords),
        )
        page.ImageRegion.append(region)
from .generated import *
from .subclasses import *
This diff is collapsed.
This diff is collapsed.
def iso_now():
    """Return the current UTC time as an ISO 8601 string.

    The result has second precision and carries no UTC offset suffix,
    e.g. ``2020-01-31T12:34:56``.
    """
    from datetime import datetime, timezone

    # datetime.utcnow() is deprecated since Python 3.12. Taking an aware
    # "now" in UTC and dropping the tzinfo yields the same naive value, so
    # the produced string keeps its format (no "+00:00" suffix).
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
    return now_utc.isoformat(timespec='seconds')
pillow
\ No newline at end of file
from setuptools import setup, find_packages

# Packaging script for pypagexml. The long description is read from
# README.md and the install requirements from requirements.txt; both files
# are expected in the current working directory.

# An explicit encoding keeps the result independent of the platform default.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

# One requirement per line; blank lines and comment lines are skipped.
# Reading line by line (instead of splitting on any whitespace) keeps
# specifiers that contain spaces, such as "pkg >= 1.0", in one piece, and
# the context manager closes the file handle again.
with open("requirements.txt", "r", encoding="utf-8") as fh:
    install_requires = [
        line.strip()
        for line in fh
        if line.strip() and not line.lstrip().startswith("#")
    ]

setup(
    name='pypagexml',
    version='0.0.0',
    packages=find_packages(),
    long_description=long_description,
    long_description_content_type="text/markdown",
    include_package_data=True,
    author="Alexander Gehrke",
    author_email="gehrke@informatik.uni-wuerzburg.de",
    url="https://gitlab2.informatik.uni-wuerzburg.de/alg81dm/pypagexml.git",
    install_requires=install_requires,
    # NOTE(review): two different licenses are declared below — confirm
    # which one actually applies to this package.
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
        "License :: OSI Approved :: Apache Software License",
        "Intended Audience :: Science/Research",
    ],
    keywords=['pagexml', 'page segmentation', 'layout'],
    data_files=[('', ["requirements.txt"])],
)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment