Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 110 additions & 40 deletions qurator/sbb_textline_detector/ocrd_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,17 @@
from ocrd import Processor
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
from ocrd_modelfactory import page_from_file
from ocrd_models import OcrdFile
from ocrd_models.ocrd_page_generateds import MetadataItemType, LabelsType, LabelType, \
CoordsType
from ocrd_models.ocrd_page_generateds import CoordsType, PageType
from ocrd_utils import (
assert_file_grp_cardinality,
getLogger,
make_file_id,
MIMETYPE_PAGE,
coordinates_for_segment,
polygon_from_points, points_from_polygon,
)
import numpy as np
from shapely.geometry import Polygon, asPolygon
from shapely.ops import unary_union

from pkg_resources import resource_string

Expand Down Expand Up @@ -63,8 +63,8 @@ def process(self):
page = pcgts.get_Page()
page_image, page_coords, page_image_info = \
self.workspace.image_from_page(
page, page_id,
feature_filter='cropped,binarized,grayscale_normalized'
page, page_id,
feature_filter='cropped,binarized,grayscale_normalized'
)

with tempfile.TemporaryDirectory() as tmp_dirname:
Expand All @@ -84,64 +84,134 @@ def process(self):

# Create a new PAGE file from the input file
pcgts.set_pcGtsId(file_id)
page = pcgts.get_Page()

# Merge results → PAGE file

# 1. Border
if page.get_Border():
log.warning("Page already contained a border")
log.warning("Removing existing page border")
page.set_Border(None)
# We need to translate the coordinates:
text_border = tmp_page.get_Border()
coords = text_border.get_Coords().get_points()
polygon = polygon_from_points(coords)
polygon_new = coordinates_for_segment(polygon, page_image, page_coords)
points_new = points_from_polygon(polygon_new)
coords_new = CoordsType(points=points_new)
text_border.set_Coords(coords_new)
page.set_Border(text_border)
text_border = adapt_coords(tmp_page.get_Border(), page, page_coords)
if text_border is None:
# intersection is empty (border outside of rotated original image)
log.warning("new border would be empty, skipping")
else:
page.set_Border(text_border)

# 2. ReadingOrder
if page.get_ReadingOrder():
log.warning("Page already contained a reading order")
log.warning("Removing existing regions' reading order")
page.set_ReadingOrder(tmp_page.get_ReadingOrder())

# 3. TextRegion
# FIXME: what about table and image regions?
if page.get_TextRegion():
log.warning("Page already contained text regions")
log.warning("Removing existing text regions")
# We need to translate the coordinates:
text_regions_new = []
for text_region in tmp_page.get_TextRegion():
coords = text_region.get_Coords().get_points()
polygon = polygon_from_points(coords)
polygon_new = coordinates_for_segment(polygon, page_image, page_coords)
points_new = points_from_polygon(polygon_new)
coords_new = CoordsType(points=points_new)
text_region.set_Coords(coords_new)
text_region = adapt_coords(text_region, page, page_coords)
if text_region is None:
# intersection is empty (polygon outside of above border)
log.warning("new text region polygon would be empty, skipping")
continue
text_regions_new.append(text_region)
text_lines_new = []
for text_line in text_region.get_TextLine():
text_line = adapt_coords(text_line, text_region, page_coords)
if text_line is None:
# intersection is empty (polygon outside of region)
log.warning("new text line polygon would be empty, skipping")
continue
text_lines_new.append(text_line)
text_region.set_TextLine(text_lines_new)
page.set_TextRegion(text_regions_new)

# Save metadata about this operation
metadata = pcgts.get_Metadata()
metadata.add_MetadataItem(
MetadataItemType(type_="processingStep",
name=self.ocrd_tool['steps'][0],
value=TOOL,
Labels=[LabelsType(
externalModel="ocrd-tool",
externalId="parameters",
Label=[LabelType(type_=name, value=self.parameter[name])
for name in self.parameter.keys()])]))
self.add_metadata(pcgts)

self.workspace.add_file(
ID=file_id,
file_grp=self.output_file_grp,
pageId=page_id,
mimetype='application/vnd.prima.page+xml',
local_filename=os.path.join(self.output_file_grp, file_id) + '.xml',
content=ocrd_models.ocrd_page.to_xml(pcgts)
ID=file_id,
file_grp=self.output_file_grp,
pageId=page_id,
mimetype='application/vnd.prima.page+xml',
local_filename=os.path.join(self.output_file_grp, file_id) + '.xml',
content=ocrd_models.ocrd_page.to_xml(pcgts)
)


def adapt_coords(segment, parent, transform):
    """Rewrite a segment's Coords in absolute coordinates, clipped to its parent.

    The segment's current outline is mapped back through ``transform``
    (undoing page-level operations such as deskewing) and then intersected
    with the outline of ``parent``.

    Returns the same ``segment`` object with its Coords replaced, or ``None``
    (leaving the segment untouched) when nothing remains after clipping.
    """
    outline = polygon_from_points(segment.get_Coords().get_points())
    # absolute coordinates first, then restrict to the parent's outline
    clipped = polygon_for_parent(
        coordinates_for_segment(outline, None, transform), parent)
    if clipped is None:
        return None
    segment.set_Coords(CoordsType(points=points_from_polygon(clipped)))
    return segment

# from ocrd_tesserocr, to be integrated into core (somehow)...
def polygon_for_parent(polygon, parent):
    """Clip polygon to parent polygon range.

    (Should be moved to ocrd_utils.coordinates_for_segment.)

    Arguments:
        polygon: sequence of coordinate pairs outlining the child segment.
        parent: the enclosing element. For a ``PageType`` the Border is used
            as clipping outline (or, if there is none, the rectangle spanned
            by imageWidth/imageHeight); for anything else its own Coords.

    Returns:
        ``polygon`` itself if it already lies within the parent, otherwise
        the exterior coordinates of the clipped shape (open, i.e. without the
        closing point), or ``None`` if no shape with positive area remains.
    """
    childp = Polygon(polygon)
    if isinstance(parent, PageType):
        border = parent.get_Border()
        if border:
            parentp = Polygon(polygon_from_points(border.get_Coords().points))
        else:
            width = parent.get_imageWidth()
            height = parent.get_imageHeight()
            parentp = Polygon([[0, 0], [0, height],
                               [width, height],
                               [width, 0]])
    else:
        parentp = Polygon(polygon_from_points(parent.get_Coords().points))
    # check if clipping is necessary
    if childp.within(parentp):
        return polygon
    # ensure input coords have valid paths (without self-intersection)
    # (this can happen when shapes valid in floating point are rounded)
    childp = make_valid(childp)
    parentp = make_valid(parentp)
    # clip to parent
    interp = childp.intersection(parentp)
    if interp.is_empty or interp.area == 0.0:
        # this happens if Tesseract "finds" something
        # outside of the valid Border of a deskewed/cropped page
        # (empty corners created by masking); will be ignored
        return None
    # NOTE: `geom_type` instead of the `type` alias, which newer Shapely
    # releases deprecate
    if interp.geom_type == 'GeometryCollection':
        # heterogeneous result: filter zero-area shapes (LineString, Point)
        interp = unary_union([geom for geom in interp.geoms if geom.area > 0])
    if interp.geom_type == 'MultiPolygon':
        # homogeneous result: construct convex hull to connect
        # FIXME: construct concave hull / alpha shape
        interp = interp.convex_hull
    if interp.minimum_clearance < 1.0:
        # follow-up calculations will necessarily be integer;
        # so anticipate rounding here and then ensure validity
        # (plain Polygon constructor rather than the deprecated
        # array-interface adapter `asPolygon`)
        interp = Polygon(np.round(interp.exterior.coords))
        interp = make_valid(interp)
    return interp.exterior.coords[:-1]  # keep open

# from ocrd_tesserocr, to be integrated into core (somehow)...
def make_valid(polygon):
    """Try to turn an invalid polygon into a valid one.

    Two strategies are applied in turn:

    1. As long as neither the polygon nor its simplification (with the
       polygon's area as tolerance) is valid, rebuild the polygon from the
       same vertices with a different starting point.
    2. As long as the polygon is invalid, simplify it with an increasing
       integer tolerance (bounded by the polygon's area).

    Arguments:
        polygon: a polygon geometry (anything offering ``exterior.coords``,
            ``is_valid``, ``area`` and ``simplify``).

    Returns:
        The (possibly rebuilt or simplified) polygon. It can still be
        invalid if neither strategy succeeded.
    """
    # Take the open ring (without the closing duplicate of the first vertex)
    # once, up front: rotating the closed ring of the already rotated polygon
    # would merely duplicate the start vertex on the first attempt and then
    # accumulate rotations and duplicates on every further attempt.
    ring = list(polygon.exterior.coords)[:-1]
    for split in range(1, len(ring)):
        if polygon.is_valid or polygon.simplify(polygon.area).is_valid:
            break
        # simplification may not be possible (at all) due to ordering
        # in that case, try another starting point
        polygon = Polygon(ring[-split:] + ring[:-split])
    for tolerance in range(1, int(polygon.area)):
        if polygon.is_valid:
            break
        # simplification may require a larger tolerance
        polygon = polygon.simplify(tolerance)
    return polygon

# Script entry point: call ocrd_sbb_textline_detector() when this module is
# executed directly rather than imported.
if __name__ == '__main__':
    ocrd_sbb_textline_detector()
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ scikit-learn
numpy == 1.18.* # XXX for tensorflow-gpu 1.15
tensorflow-gpu ~=1.15.2
scipy
ocrd >= 2.0.0
ocrd >= 2.18.0
shapely >= 1.7.1