#!/usr/bin/perl
# groff2db 0.2. Copyright (c) Michael Still 2001 (mikal@stillhq.com)
# Released under the terms of the GNU GPL. Please submit all patches to mikal@stillhq.com
# Initially developed for Greg Lehey's "The Complete FreeBSD"... Many thanks for his
# sponsorship and support of this project.
# The basic concept of this script is to convert troff marked up input into
# docbook compliant XML. This script is somewhat Greg specific at the moment, but I would
# love to make it more generalised over time...
# Screen is not entirely appropriate for display mapping
# There are some section references which need to be manually fixed (search for "Section reference" in the SGML)
# Underway
# --------
# - Table conversion
# Todo
# ----
# - et al
# - ensure no blank lines at the end of program listings...
# Unread email
# ------------
# - em2
# - em4
use strict;
# File-level state shared by the whole conversion.
# NOTE(review): $INPUT, $OUTPUT and $PREPROC are declared here, but the code
# visible in this file does its I/O through the bareword handles INPUT, OUTPUT
# and PREPROC. The lexicals are kept in case later code refers to them.
my($INPUT, $OUTPUT, $PREPROC, $outputstyle);
# Parser state. $tablestate used to be listed twice in this declaration; it is
# now declared exactly once.
my($parastate, $liststate, $listitemstate, $footnotestate, $sectionstate, $tablestate, $stylestate, $displaystate, $waitstate, $quotestate, $pausedlist, $tablecolcount, $tablebracestate);
my($length);
my($chapters, $chapter, $chaptercount, $temp, $inputused);
my($linecount, $condlinecount);

# Check the command line arguments: a first argument of "jade" selects one
# big output file, anything else (including no argument at all) selects one
# output file per chapter.
if(defined($ARGV[0]) && ($ARGV[0] eq "jade")){
    print "Jade output style selected (one big file).\n";
    print "The output will go to output.sgml\n";
    $outputstyle = "jade";
}
else{
    print "Output style is gmat (many docbook files).\n";
    print "The files will have .sgml extensions...\n";
    $outputstyle = "gmat";
}

# Space separated list of chapter base names; each one is read from
# <name>.mm further down.
$chapters = "introduction concepts evolution quickinstall shareinstall install xsetup postinstall ports problems xtheory unixref starting filesys disks tapes printers desktop building current emulate netintro netsetup isp modems ppp slip dns firewall netdebug nfs netbasics email www fax micronet mobile";
$chaptercount = 0;
$linecount = 0;

if($outputstyle eq "jade"){
    open(OUTPUT, ">", "output.sgml")
        or die "Cannot open output.sgml for writing: $!\n";

    # Print out the prelude. This only happens in jade mode: in gmat mode
    # OUTPUT has not been opened yet at this point, so the prints never
    # produced any output there.
    # NOTE(review): the prelude strings look as if their SGML markup has been
    # lost; they are reproduced as found -- confirm against a pristine copy.
    print OUTPUT "\n";
    print OUTPUT "The Complete FreeBSDGregLehey2001Greg Lehey\n";
}
# For each chapter we specified
foreach $chapter (split(/ /, $chapters)){
# In gmat mode every chapter gets its own output file, <chapter>.sgml. In
# jade mode OUTPUT is left alone. Without a writable output file the
# conversion of this chapter would be lost silently, so a failure is fatal.
if($outputstyle ne "jade"){
    open(OUTPUT, ">", "$chapter.sgml")
        or die "Cannot open $chapter.sgml for writing: $!\n";
}
$chaptercount++;
print "Preprocessing chapter $chapter\n";
print " - Table conversion\n";
# Preprocessing pass: copy <chapter>.mm to <chapter>.pp. Lines outside a
# table are copied unchanged. Between a .TH line and the following .TE line
# the input is collected in $temp and written out in one go when the .TE is
# seen.
#
# A missing input file only produces a warning: the read loop then sees no
# lines and an empty .pp file is written, which is what happened (silently)
# before. Failing to create the .pp file is fatal.
open(INPUT, "<", "$chapter.mm")
    or warn "Cannot open $chapter.mm for reading: $!\n";
open(PREPROC, ">", "$chapter.pp")
    or die "Cannot open $chapter.pp for writing: $!\n";
$tablestate = 0;
# Read the chapter line by line; the loop used to have an empty condition,
# which never reads anything and never terminates.
while(<INPUT>){
    # Table start
    if(/^\.TH[ \t]*(.*)/i){
        $tablestate = 1;
        $temp = "";
    }
    # Table end: flush what has been collected. $tablestate is cleared first,
    # so the .TE line itself is copied through by the else branch below.
    elsif(/^\.TE/i){
        $tablestate = 0;
        print PREPROC "$temp\n";
    }

    # If we are in table state, then we need to rewrite lines for the table cells
    if($tablestate == 1){
        ##s/T\}\#T\{/\#/g;
        # Drop the T{ ... T} text block markers.
        s/T\{\n//;
        s/T\}//g;
        if(/^\./){
            # Lines starting with a dot are kept on a line of their own.
            $_ = "\n$_";
        }
        elsif(!/\#/){
            # Lines without a '#' lose their newline so that they are joined
            # with whatever follows; lines with a '#' keep it.
            chomp;
        }
        $temp = "$temp $_";
    }
    else{
        print PREPROC $_;
    }
}
close INPUT;
# Buffered write errors only show up when the handle is closed.
close(PREPROC)
    or warn "Error while closing $chapter.pp: $!\n";
print "Processing chapter $chapter ($chaptercount)\n";
# The main pass reads the preprocessed copy of the chapter. If it cannot be
# opened the chapter simply yields no input lines; say so instead of staying
# silent.
open(INPUT, "<", "$chapter.pp")
    or warn "Cannot open $chapter.pp for reading: $!\n";

# Every chapter starts with a clean parser state. ($tablestate used to be
# reset twice here; once is enough.)
$liststate = 0;
$listitemstate = 0;
$parastate = 0;
$footnotestate = 0;
$sectionstate = 0;
$tablestate = 0;
$displaystate = 0;
$stylestate = "";
$quotestate = 0;
$inputused = 0;
$pausedlist = 0;
# For each line in the file...
while(){
$linecount++;
$inputused = 1;
# There are some conditional elements used, for instance in xtheory.mm (which is also an
# article). When the current line opens one of the listed "..if <condition>"
# sections, the lines that follow are read and thrown away up to and including
# the matching "..else" or "..endif" line (or the end of the input). That
# terminating line is left in $_ for the code below.
# The match is case insensitive and only anchored at the start, so the final
# "X" alternative accepts any condition beginning with "x".
if(/^\.\.if (?:article|XXX|appropriate|complete|finished|fixme|isdn|itworks|long|needed|netnews|network|partofthebook|raid5|review|short|sorted|verylong|X)/i){
    $waitstate = 1;
    $condlinecount = 0;
    # defined() keeps a final line such as "0" without a newline from being
    # mistaken for the end of the input.
    while(($waitstate == 1) && defined($_ = <INPUT>)){
        $linecount++;
        $condlinecount++;
        # The dots are escaped so that only a literal "..else" / "..endif"
        # ends the skipped region; unescaped they matched any two characters.
        if(/^\.\.(?:else|endif)/){
            $waitstate = 0;
        }
    }
}
# Escape the characters that are markup in SGML. The ampersand has to be done
# first, otherwise the ampersands of the entities inserted for '<' and '>'
# would be escaped a second time.
# Ampersands
s/&/&amp;/g;
# LT
s/</&lt;/g;
# GT
s/>/&gt;/g;
# Mu (micro) doesn't work with db2ps (jade)
# NOTE(review): pattern and replacement are reproduced as found (micro sign ->
# Greek mu); either side may originally have been an entity -- confirm.
s/µ/μ/g;
# Sometimes there are some other special characters that we deal with.
# Every \(ae on the line is replaced; the replacement used to carry a stray
# "g" and only the first occurrence was handled.
s/\\\(ae/æ/g;
# Footnote thingie we don't use
s/\\\*F//g;
# Bold italic for a single word (there is no equivalent, so merely italics).
# The word is wrapped in a complete <emphasis> element; previously only the
# closing tag was emitted.
s/\\f\(BI([^ \t\"\\]*)/<emphasis>$1<\/emphasis>/ig;
# Em dash (\(em): replaced together with any blanks or tabs around it.
# NOTE(review): this rule used to be labelled "Italic for a single word", but
# no such rule is visible here -- presumably processstyles() covers italics.
s/[ \t]*\\\(em[ \t]*/—/ig;
# We run some little conversion rules to deal with things like italics et al
$temp = processstyles($displaystate, $_);
$_ = $temp;
##############################################################
# Comments
if(/^\.\\\"(.*)/i){
$_ = $1;
s/nroff-fill/sgml-fill/g;
print OUTPUT "\n";
}
# No mapping
elsif(/^\.\./){
}
# Aside start
elsif(/^\.Aside(.*)/i){
endpara();
print OUTPUT "\n";
startpara();
}
elsif(/^\.End-aside(.*)/i){
endpara();
print OUTPUT "\n";
startpara();
}
# No mapping?
elsif(/^\.ad/i){
}
# No mapping?
elsif(/^\.br/i){
}
# No mapping
elsif(/^\.ce/i){
}
# Defining display strings
elsif(/^.ds/){
}
# Display start -- is case sensitive
elsif((/^\.DS(.*)/) || (/^\.Dx(.*)/i)){
endpara();
print OUTPUT closeallstates();
print OUTPUT "\n";
$displaystate = 1;
}
# Display end
elsif(/^\.DE(.*)/i){
if($displaystate == 1){
print OUTPUT closeallstates();
print OUTPUT "\n";
$displaystate = 0;
startpara();
}
}
# The content of floating displays is handled in with the images themselves
# This also squelches the display output command
elsif(/^\.DF/i){
}
# Ifs
elsif(/^\.\.if/i){
}
# Else
elsif(/^\.\.else/i){
}
# Endifs
elsif(/^\.\.endif/i){
}
# Setting the font family
elsif(/^\.fam/i){
# No mapping?
}
# Set the font
elsif(/^\.ft/i){
# No mapping
}
# Footnote start
elsif(/^\.FS[ \t]*(.*)/i){
print OUTPUT "\n";
startpara();
$footnotestate = 1;
}
# Footnote end
elsif(/^\.FE[ \t]*(.*)/i){
if($footnotestate == 0){
dblog("{DBLOG: Footnote end for a non-open footnote!}");
}
endpara();
print OUTPUT "\n";
$footnotestate = 0;
}
# Figure headings
elsif(/^\.Figure-heading [\"]*([^\"]*)[\"]*/i){
print OUTPUT "";
if($1 ne ""){
print OUTPUT "$1";
}
print OUTPUT "\n";
}
# Filenames
elsif(/^\.File ([ \t]*)(.*)/i){
print OUTPUT "$1$2\n";
}
# Headings
elsif(/^\.H([0-7])[ \t]*(.*)/i){
print OUTPUT closeallstates();
endpara();
# Docbook only goes up to sect5
if($1 > 5){
print "Warning: docbook sect limit exceeded\n";
}
while($sectionstate > ($1 - 2)){
print OUTPUT "\n";
$sectionstate--;
}
while($sectionstate < ($1 - 2)){
$sectionstate++;
print OUTPUT "\n";
}
$sectionstate++;
print OUTPUT "\n";
$_ = $2;
s/\"//g;
if($_ ne ""){
print OUTPUT "$_\n";
}
startpara();
}
# Headings (Unsupported)
elsif(/^\.H([8-9])[ \t]*(.*)/i){
print OUTPUT closeallstates();
endpara();
print OUTPUT "UNSUPPRTED SECTION LEVEL\n";
$sectionstate = 1;
}
# Highlights
elsif(/^\.Highlight(.*)/i){
endpara();
print OUTPUT "\n";
startpara();
}
elsif(/^\.End-highlight(.*)/i){
endpara();
print OUTPUT "\n";
startpara();
}
# Indents (not used very often in Greg's source)
elsif(/^\.Indent(.*)/i){
endpara();
print OUTPUT "\n";
startpara();
}
elsif(/^\.End-indent(.*)/i){
endpara();
print OUTPUT "\n";
startpara();
}
# Another form of indenting (has no mapping)
elsif(/^\.in/i){
}
# List start
elsif((/^\.LS(.*)/i) || (/^\.LB(.*)/i)){
liststart();
}
# List item
elsif(/^\.LI[ \t]*(.*)/i){
if($liststate == 0){
dblog("{DBLOG: List item for a non-open list!}");
liststart();
}
if($pausedlist == 1){
print OUTPUT "\n";
$pausedlist = 0;
}
endpara();
if($listitemstate == 1){
print OUTPUT "\n";
}
print OUTPUT "\n";
startpara();
$listitemstate = 1;
}
# List end
elsif(/^\.LE(.*)/i){
if($liststate == 0){
dblog("{DBLOG: List end for a non-open list!}");
}
endpara();
if($listitemstate > 0){
print OUTPUT "\n";
}
if($liststate == 1){
print OUTPUT "\n";
startpara();
}
else{
print OUTPUT "\n";
$pausedlist = 1;
}
$liststate--;
dblog("{DEBUG DBLOG: listlevel = $liststate}");
$listitemstate = 0;
if($liststate < 0){
dblog("{DBLOG: Negative list state entered}");
}
}
# No mapping
elsif(/^\.na/i){
}
# Space on page
elsif(/^\.ne/i){
# No mapping to docbook?
}
# No fill
elsif(/^\.nf/i){
# No mapping to docbook?
}
# Set the fill (after a ne?)
elsif(/^\.fi/i){
# No mapping to docbook?
}
# Unknown tag!
# elsif(/^\.Pn[ \t]+(.*)/i){
# print "UNKNOWN COMMAND: Pn [Arg $1]\n";
# }
# Paragraphs
elsif(/^\.P[ \t]*([0-9]*)(.*)/i){
endpara();
startpara();
}
# Pictures
elsif(/^\.PIC [\"]*([^\"]*)[\"]*/i){
print OUTPUT "\n";
}
# Quotes
elsif(/^\.Quote(.*)/i){
print OUTPUT "\n";
$quotestate = 1;
}
elsif((/^\.End-quote(.*)/i) && ($quotestate == 1)){
print OUTPUT "\n";
}
elsif(/^.Sref[ \t]*\\\*\[(.*)\]/i){
print OUTPUT "...Section reference $1 (edit manually)...\n";
}
# Insets are treated as quotes as well
elsif(/^\in[ \t]\+(.*)/i){
print OUTPUT "\n";
$quotestate = 1;
}
elsif((/^\.in/i) && ($quotestate == 1)){
print OUTPUT "\n";
}
# Unknown command
# elsif(/^\.Sref[ \t]*(.*)/i){
# print "UNKNOWN COMMAND: Sref [Arg $1]\n";
# }
# Spacing commands don't map to db
elsif(/^\.sp(.*)/i){
}
# I believe that so is the inclusion of a shared object
elsif(/^\.so/i){
}
# Table title
elsif(/^\.TB[ \t]*[\"]*([^\"]*)[\"]*/i){
print OUTPUT "