#!/usr/bin/perl
# Conversion of PostgreSQL documentation from Docbook 4.2 sgml-format into
# Docbook 4.2 xml-format.
#
# Based on a script from Jürgen Purtz
#
# The script expands the SGML constructs 'shorttags' and 'empty elements'. Additionally
# it handles one special postgres case.
#
use strict;
use warnings;
use autodie; # die if problem reading or writing a file
# -------------- Input -------------
# Slurp the whole input with a single read: while the input record
# separator is undefined, one <> returns everything up to end-of-file.
# (<> reads STDIN when no file name is given on the command line.)
my $content = do {
    local $/ = undef;
    <>;
};
# Use the lower-case spelling of the class attribute value.
$content =~ s{ class="PARAMETER"}{ class="parameter"}g;
# -------------- Empty (per definition in DTD) elements --------------
# List of 'empty' elements in Docbook. They don't need to have an end tag,
# e.g. <xref linkend="abc"> is complete without an end tag or a trailing '/>'.
# Afaik PostgreSQL uses only 'xref', 'co' and 'footnoteref'.
# In addition to the Docbook elements we handle the colspec and spanspec
# elements of cals tables.
my $emptyElements = join '|', qw(
    anchor area audiodata beginpage co coref footnoteref graphic imagedata
    inlinegraphic sbr textdata varargs videodata void xref colspec spanspec
);

# Rewrite every empty element into its long notation: start-tag directly
# followed by the matching end-tag.
# As one of the following steps we use the tool 'osx'. osx tries to close the
# empty tags again, which results in unwanted additional - and in some cases
# invalid - CDATA. As long as osx is used we must use the long notation.
#
# Parameter: the document text.
# Returns:   the text with all listed elements written as <name ...></name>.
#            Tags already written as <name .../> or <name ...></name> end up
#            in the same long form, so the step can be applied repeatedly.
sub close_empty_elements {
    my ($text) = @_;
    $text =~ s{
        <($emptyElements)     # capture 1: element name
        ((?:\s[^>]*?)?)       # capture 2: attributes (may span line breaks)
        \s*/?>                # end of the tag, with or without a slash
        (?:</\1\s*>)?         # swallow an end-tag that is already there
    }{<$1$2></$1>}gx;
    return $text;
}
$content = close_empty_elements($content);
# --------------- Shorttags ------------------------
# Prevent replacing tags in comments: inside every <!-- ... --> the characters
# '<' and '>' are swapped for the placeholders ° and §, so that nothing in a
# comment looks like a tag any more. The comment delimiters stay untouched.
#
# Parameter: the document text.
# Returns:   the text with the markup characters inside comments masked.
sub mask_comment_markup {
    my ($text) = @_;
    $text =~ s{<!--(.*?)-->}{
        my $body = $1;          # copy first: the inner s/// resets the captures
        $body =~ s/</°/g;
        $body =~ s/>/§/g;
        '<!--' . $body . '-->';
    }sge;
    return $text;
}
$content = mask_comment_markup($content);
# Construct an expression which matches a start-tag together with the tag
# that closes it: either the shorttag "</>" or the regular end-tag.
# The idea is to handle the tree of nodes from its leaves to the top with
# one s/.../.../g command per level.
# Don't use greedy patterns. We must match the nearest '>'.

# Element name: a letter followed by letters and digits.
my $tagname = '[A-Za-z][A-Za-z0-9]*';

# Pattern for (multiple) attributes: whitespace, then any string up to '>'
# or '/>'. Example: the ' linkend="abc"' part of <xref linkend="abc">.
# The pattern contains no capture groups, so it can be used several times
# in one expression without shifting the capture numbers.
my $attr = '(?:\s+(?:(?!>)(?!/>).)+?)?';

# Pattern for an element without child elements.
# Beware: variables are interpolated even inside the comments of an /x
# pattern, so these comments must not mention any variable by name.
my $regex = qr{
    <($tagname)($attr)>              # regular start-tag: name is capture 1,
                                     # attributes (possibly empty) capture 2
    (?<content>                      # element content as named capture
        (?:(?:                       # negative look-aheads, the next text is
            (?!<$tagname$attr>)      #   not a regular start-tag
            (?!<$tagname$attr/>)     #   not an empty tag
            (?!</$tagname>)          #   not a regular end-tag
            (?!</>)                  #   not a shorttag
            .                        # move forward
        ){0,32000}+ )*+              # to overcome the Perl 32K limit the
                                     # content is split into many chunks;
                                     # possessive quantifiers speed things up
    )
    </(?:\1)?>                       # the shorttag or the matching end-tag
}xs;

# Perform the expansion of shorttags. Because of the recursive nature of the
# node tree it is necessary
# a) to work with a loop which processes the tree from leaf nodes to root node
# b) to convert the matched tags to some form of regular content which differs
#    from SGML/XML syntax, so that the parent becomes a leaf for the next
#    pass. We use ° and § as they do not occur in the PostgreSQL docs.
# (There is a way to match recursive REs - but not to replace them, afaik.)
#
# Parameter: the document text.
# Returns:   the text in which every matched element is written as
#            °name attrs§content°/name§ (placeholders instead of < and >).
sub expand_shorttags {
    my ($text) = @_;
    # the loop: one pass per nesting level, until nothing matches any more
    1 while $text =~ s/$regex/°$1$2§$+{content}°\/$1§/sg;
    return $text;
}
$content = expand_shorttags($content);
# Restore the SGML/XML syntax: the placeholders ° and §, which stand for
# '<' and '>' during the shorttag expansion, are turned back into the real
# characters.
#
# Parameter: the document text containing placeholders.
# Returns:   the text with every ° replaced by '<' and every § by '>'.
sub restore_markup {
    my ($text) = @_;
    $text =~ s/°/</g;
    $text =~ s/§/>/g;
    return $text;
}
print restore_markup($content);