#!/usr/bin/perl

# groff2db 0.2. Copyright (c) Michael Still 2001 (mikal@stillhq.com)
# Released under the terms of the GNU GPL. Please submit all patches to mikal@stillhq.com

# Initially developed for Greg Lehey's "The Complete FreeBSD"... Many thanks for his
# sponsorship and support of this project.

# The basic concept of this script is to convert troff marked up input into
# docbook compliant XML. This script is somewhat Greg specific at the moment, but I would
# love to make it more generalised over time...
#
# Usage: the optional first argument "jade" selects one big output file
# (output.sgml); anything else writes one <chapter>.sgml per chapter.
# For every chapter name in $chapters the script reads <chapter>.mm, writes a
# preprocessed copy to <chapter>.pp (table cells joined onto single lines) and
# then converts <chapter>.pp line by line.
#
# NOTE(review): many print statements below emit only "\n" or "". The DocBook
# markup that presumably belongs inside those string literals is not present
# in this copy of the script -- restore it from the upstream source before
# relying on the generated output. Those literals are reproduced here exactly
# as found; only markup that can be derived from the closing tags still
# present in this file has been put back (see processstyles and the \f(BI
# substitution).

use strict;
use warnings;

my($INPUT, $OUTPUT, $PREPROC, $outputstyle);
my($parastate, $liststate, $listitemstate, $footnotestate, $sectionstate,
   $tablestate, $stylestate, $displaystate, $waitstate, $quotestate,
   $pausedlist, $tablecolcount, $tablebracestate);
my($length);
my($chapters, $chapter, $chaptercount, $temp, $inputused);
my($linecount, $condlinecount, $firstsect);

# We need to check the command line arguments...
if(defined($ARGV[0]) && ($ARGV[0] eq "jade")){
    print "Jade output style selected (one big file).\n";
    print "The output will go to output.sgml\n";
    $outputstyle = "jade";
}
else{
    print "Output style is gmat (many docbook files).\n";
    print "The files will have .sgml extensions...\n";
    $outputstyle = "gmat";
}

$chapters = "introduction concepts evolution quickinstall shareinstall install xsetup postinstall ports problems xtheory unixref starting filesys disks tapes printers desktop building current emulate netintro netsetup isp modems ppp slip dns firewall netdebug nfs netbasics email www fax micronet mobile";
$chaptercount = 0;
$linecount = 0;
$stylestate = "";

# The prelude only makes sense for the single-file output: in gmat mode no
# output file is open yet at this point, so printing would go nowhere.
if($outputstyle eq "jade"){
    open(OUTPUT, ">", "output.sgml")
        or die "Cannot open output.sgml for writing: $!\n";

    # Print out the prelude
    print OUTPUT "\n";
    print OUTPUT "The Complete FreeBSDGregLehey2001Greg Lehey\n";
}

# For each chapter we specified
foreach $chapter (split(/ /, $chapters)){
    if($outputstyle ne "jade"){
        open(OUTPUT, ">", "$chapter.sgml")
            or die "Cannot open $chapter.sgml for writing: $!\n";
    }

    $chaptercount++;
    print "Preprocessing chapter $chapter\n";
    print " - Table conversion\n";

    open(PREPROC, ">", "$chapter.pp")
        or die "Cannot open $chapter.pp for writing: $!\n";
    $tablestate = 0;
    $temp = "";

    # Some chapter sources do not exist. That is tolerated: the preprocessed
    # file is then empty and " File was empty" is reported further down.
    if(open(INPUT, "<", "$chapter.mm")){
        while(defined($_ = <INPUT>)){
            # Table heading: start collecting the table body
            if(/^\.TH[ \t]*(.*)/i){
                $tablestate = 1;
                $temp = "";
            }
            # Table end: flush what was collected. Only flush when a table
            # was actually being collected, otherwise stale text from a
            # previous table would be written a second time.
            elsif(/^\.TE/i){
                if($tablestate == 1){
                    print PREPROC "$temp\n";
                }
                $tablestate = 0;
                $temp = "";
            }

            # If we are in table state, then we need to rewrite lines for the table cells
            if($tablestate == 1){
                s/T\{\n//;
                s/T\}//g;

                if(/^\./){
                    # Requests start on a line of their own
                    $_ = "\n$_";
                }
                elsif(/\#/){
                    # A row containing cell separators keeps its newline
                }
                else{
                    chomp;
                }

                $temp = "$temp $_";
            }
            else{
                print PREPROC "$_";
            }
        }
        close INPUT;
    }
    close(PREPROC) or die "Cannot finish writing $chapter.pp: $!\n";

    print "Processing chapter $chapter ($chaptercount)\n";
    open(INPUT, "<", "$chapter.pp")
        or die "Cannot open $chapter.pp for reading: $!\n";

    $liststate = 0;
    $listitemstate = 0;
    $parastate = 0;
    $footnotestate = 0;
    $sectionstate = 0;
    $tablestate = 0;
    $displaystate = 0;
    $stylestate = "";
    $quotestate = 0;
    $inputused = 0;
    $pausedlist = 0;
    $firstsect = 1;

    # For each line in the file...
    while(defined($_ = <INPUT>)){
        $linecount++;
        $inputused = 1;

        # There are some conditional elements used, for instance in xtheory.mm (which is also an
        # article). Everything up to the matching ..else / ..endif is skipped.
        if(/^\.\.if (?:article|XXX|appropriate|complete|finished|fixme|isdn|itworks|long|needed|netnews|network|partofthebook|raid5|review|short|sorted|verylong|X)/i){
            $waitstate = 1;
            $condlinecount = 0;

            while(($waitstate == 1) && defined($_ = <INPUT>)){
                $linecount++;
                $condlinecount++;

                if(/^\.\.else/){
                    $waitstate = 0;
                }
                elsif(/^\.\.endif/){
                    $waitstate = 0;
                }
            }

            # The file ended inside the conditional: nothing left to convert
            last unless defined($_);
        }

        # Ampersands (must come first so the entities below are not re-escaped)
        s/&/&amp;/g;

        # LT
        s/</&lt;/g;

        # GT
        s/>/&gt;/g;

        # Mu (micro) doesn't work with db2ps (jade)
        s/µ/μ/g;

        # Sometimes there are some other special characters that we deal with
        s/\\\(ae/æ/g;

        # Footnote thingie we don't use
        s/\\\*F//g;

        # Bold italic for a single word (there is no equivalent, so merely italics)
        s/\\f\(BI([^ \t\"\\]*)/<emphasis>$1<\/emphasis>/ig;

        # Em dash, swallowing the whitespace around it
        s/[ \t]*\\\(em[ \t]*/—/ig;

        # We run some little conversion rules to deal with things like italics et al
        $temp = processstyles($displaystate, $_);
        $_ = $temp;

        ##############################################################
        # Comments
        if(/^\.\\\"(.*)/i){
            $_ = $1;
            s/nroff-fill/sgml-fill/g;
            print OUTPUT "\n";
        }

        # No mapping
        elsif(/^\.\./){
        }

        # Aside start
        elsif(/^\.Aside(.*)/i){
            endpara();
            print OUTPUT "\n";
            startpara();
        }

        elsif(/^\.End-aside(.*)/i){
            endpara();
            print OUTPUT "\n";
            startpara();
        }

        # No mapping?
        elsif(/^\.ad/i){
        }

        # No mapping?
        elsif(/^\.br/i){
        }

        # No mapping
        elsif(/^\.ce/i){
        }

        # Defining display strings
        elsif(/^\.ds/){
        }

        # Display start -- is case sensitive
        elsif((/^\.DS(.*)/) || (/^\.Dx(.*)/i)){
            endpara();
            print OUTPUT closeallstates();
            print OUTPUT "\n";
            $displaystate = 1;
        }

        # Display end
        elsif(/^\.DE(.*)/i){
            if($displaystate == 1){
                print OUTPUT closeallstates();
                print OUTPUT "\n";
                $displaystate = 0;
                startpara();
            }
        }

        # The content of floating displays is handled in with the images themselves
        # This also squelches the display output command
        elsif(/^\.DF/i){
        }

        # Ifs
        elsif(/^\.\.if/i){
        }

        # Else
        elsif(/^\.\.else/i){
        }

        # Endifs
        elsif(/^\.\.endif/i){
        }

        # Setting the font family
        elsif(/^\.fam/i){
            # No mapping?
        }

        # Set the font
        elsif(/^\.ft/i){
            # No mapping
        }

        # Footnote start
        elsif(/^\.FS[ \t]*(.*)/i){
            print OUTPUT "\n";
            startpara();
            $footnotestate = 1;
        }

        # Footnote end
        elsif(/^\.FE[ \t]*(.*)/i){
            if($footnotestate == 0){
                dblog("{DBLOG: Footnote end for a non-open footnote!}");
            }

            endpara();
            print OUTPUT "\n";
            $footnotestate = 0;
        }

        # Figure headings
        elsif(/^\.Figure-heading [\"]*([^\"]*)[\"]*/i){
            print OUTPUT "\n";
            if($1 ne ""){
                print OUTPUT "$1";
            }
            print OUTPUT "\n\n";
        }

        # Filenames
        elsif(/^\.File ([ \t]*)(.*)/i){
            print OUTPUT "$1$2\n";
        }

        # Headings
        elsif(/^\.H([0-7])[ \t]*(.*)/i){
            # Keep the captures: the helpers and the s/// below must not be
            # able to disturb them.
            my($level, $title) = ($1, $2);

            print OUTPUT closeallstates();
            endpara();

            # Docbook only goes up to sect5
            if($level > 5){
                print "Warning: docbook sect limit exceeded\n";
            }

            # Close sections that are deeper than the new heading
            while($sectionstate > ($level - 2)){
                if($firstsect == 1){
                    $firstsect = 0;
                }
                else{
                    print OUTPUT "\n";
                }
                $sectionstate--;
            }

            # Open intermediate sections if the heading skips levels
            while($sectionstate < ($level - 2)){
                $sectionstate++;
                if($firstsect == 0){
                    print OUTPUT "\n";
                }
            }

            $sectionstate++;
            if($firstsect == 0){
                print OUTPUT "\n";
            }

            $title =~ s/\"//g;
            if($title ne ""){
                print OUTPUT "$title\n";
            }
            startpara();
        }

        # Headings (Unsupported)
        elsif(/^\.H([8-9])[ \t]*(.*)/i){
            print OUTPUT closeallstates();
            endpara();
            print OUTPUT "UNSUPPRTED SECTION LEVEL\n";
            $sectionstate = 1;
        }

        # Highlights
        elsif(/^\.Highlight(.*)/i){
            endpara();
            print OUTPUT "\n";
            startpara();
        }

        elsif(/^\.End-highlight(.*)/i){
            endpara();
            print OUTPUT "\n";
            startpara();
        }

        # Indents (not used very often in Greg's source)
        elsif(/^\.Indent(.*)/i){
            endpara();
            print OUTPUT "\n";
            startpara();
        }

        elsif(/^\.End-indent(.*)/i){
            endpara();
            print OUTPUT "\n";
            startpara();
        }

        # Another form of indenting (has no mapping)
        elsif(/^\.in/i){
        }

        # List start
        elsif((/^\.LS(.*)/i) || (/^\.LB(.*)/i)){
            liststart();
        }

        # List item
        elsif(/^\.LI[ \t]*(.*)/i){
            if($liststate == 0){
                dblog("{DBLOG: List item for a non-open list!}");
                liststart();
            }

            if($pausedlist == 1){
                print OUTPUT "\n";
                $pausedlist = 0;
            }

            endpara();
            if($listitemstate == 1){
                print OUTPUT "\n";
            }
            print OUTPUT "\n";
            startpara();
            $listitemstate = 1;
        }

        # List end
        elsif(/^\.LE(.*)/i){
            if($liststate == 0){
                dblog("{DBLOG: List end for a non-open list!}");
            }

            endpara();
            if($listitemstate > 0){
                print OUTPUT "\n";
            }

            if($liststate == 1){
                print OUTPUT "\n";
                startpara();
            }
            else{
                # A nested list ended: the enclosing item is closed lazily
                # by the next .LI (see $pausedlist there).
                print OUTPUT "\n";
                $pausedlist = 1;
            }

            $liststate--;
            dblog("{DEBUG DBLOG: listlevel = $liststate}");
            $listitemstate = 0;

            if($liststate < 0){
                dblog("{DBLOG: Negative list state entered}");
            }
        }

        # No mapping
        elsif(/^\.na/i){
        }

        # Space on page
        elsif(/^\.ne/i){
            # No mapping to docbook?
        }

        # No fill
        elsif(/^\.nf/i){
            # No mapping to docbook?
        }

        # Set the fill (after a ne?)
        elsif(/^\.fi/i){
            # No mapping to docbook?
        }

        # Page link target -- some of Greg's anchors start with numbers, which is why we insert some text
        elsif(/^\.Pn[ \t]+(.*)/i){
            print OUTPUT "\n";
        }

        # Pictures. This has to be tested before the paragraph request
        # below, whose pattern also matches ".PIC".
        elsif(/^\.PIC [\"]*([^\"]*)[\"]*/i){
            print OUTPUT "\n\n";
        }

        # Paragraphs
        elsif(/^\.P[ \t]*([0-9]*)(.*)/i){
            endpara();
            startpara();
        }

        # Quotes
        elsif(/^\.Quote(.*)/i){
            print OUTPUT "\n";
            $quotestate = 1;
        }

        elsif((/^\.End-quote(.*)/i) && ($quotestate == 1)){
            print OUTPUT "\n";
            $quotestate = 0;
        }

        # elsif(/^.Sref[ \t]*\\\*\[(.*)\]/i){
        #     print OUTPUT "...Section reference $1 (edit manually)...\n";
        # }

        # Insets are treated as quotes as well
        # NOTE(review): this branch and the next one can never be reached,
        # because the "no mapping" .in branch further up matches first.
        # Decide which of the two treatments of .in is wanted.
        elsif(/^\.in[ \t]\+(.*)/i){
            print OUTPUT "\n";
            $quotestate = 1;
        }

        elsif((/^\.in/i) && ($quotestate == 1)){
            print OUTPUT "\n";
            $quotestate = 0;
        }

        # Spacing commands don't map to db
        elsif(/^\.sp(.*)/i){
        }

        # I believe that so is the inclusion of a shared object
        elsif(/^\.so/i){
        }

        # Table title
        elsif(/^\.TB[ \t]*[\"]*([^\"]*)[\"]*/i){
            print OUTPUT "";
            if($1 ne ""){
                print OUTPUT "$1\n";
            }
            $tablestate = 1;
        }

        # Table start
        elsif(/^\.TS[ \t]*(.*)/i){
        }

        # Table heading
        elsif(/^\.TH(.*)/i){
            $tablestate = 2;
        }

        # Table end
        elsif(/^\.TE/i){
            print OUTPUT "\n\n";
            $tablestate = 0;
        }

        # Table reference for later
        elsif(/^\.Tn/i){
        }

        # Chapters
        elsif(/^\.Chapter[ \t]\\\*\[([^ \t]*)\][ \t]\"(.*)\"/){
            print OUTPUT "";
            if($2 ne ""){
                print OUTPUT "$2\n";
            }

            # The introductory text doesn't need to have a sect1
            startpara();
            $sectionstate = 1;
        }

        elsif(/^\.X[ \t]\"*([^ \",]*)[ \t,]*([^,\"]*)\"*/i){
            print OUTPUT "$1";
            if($2 ne ""){
                print OUTPUT "$2";
            }
            print OUTPUT "\n";
        }

        # We don't know anything about this command at all
        elsif(/^\.(.*)/i){
            print "DEFAULT COMMAND FOUND: $1\n";
        }

        # A document line?
        else{
            # References to chapters and stuff like that
            s/\\\*\[([^\]]*)\]//g;

            # Inside a table body "#" separates the cells
            if($tablestate > 1){
                s/\#/<\/entry>/g;
            }

            # First row after the table heading: work out the column count
            # (one more than the number of cell separators on the row).
            if($tablestate == 2){
                my $separators = () = $_ =~ /<\/entry>/g;
                $tablecolcount = 1 + $separators;
                print OUTPUT "\n";
                $tablestate = 3;
            }

            if($tablestate > 1){
                print OUTPUT "";
            }

            s/[ \t\n]+$//;
            # chomp;
            print OUTPUT "$_\n";

            if($tablestate > 1){
                print OUTPUT "\n";
            }
        }
    }

    endpara();
    if($liststate == 1){
        dblog("{DBLOG: Not closed list at end of document!}");
    }
    if($footnotestate == 1){
        dblog("{DBLOG: Not closed footnote at end of document!}");
    }

    # Close every section that is still open
    while($sectionstate > 0){
        print OUTPUT "\n\n";
        $sectionstate--;
    }

    # Some input files are not used because they don't exist!
    if($inputused > 0){
        print OUTPUT "\n";
    }
    else{
        print " File was empty\n";
    }

    close INPUT;
    if($outputstyle ne "jade"){
        close(OUTPUT) or die "Cannot finish writing $chapter.sgml: $!\n";
    }
}

print "Processed $chaptercount chapters\n";

# In gmat mode every chapter file has already been closed above, so the
# closing text only applies to the single jade output file.
if($outputstyle eq "jade"){
    print OUTPUT "\n\n";
    close(OUTPUT) or die "Cannot finish writing output.sgml: $!\n";
}
exit;

# Open a list. A nested list first closes the paragraph and, if one is open,
# the current list item of the enclosing list. Increments $liststate.
sub liststart{
    endpara();

    if($liststate == 0){
        print OUTPUT "\n";
    }
    else{
        if($listitemstate > 0){
            print OUTPUT "\n";
            $listitemstate--;
        }

        print OUTPUT "\n";
        print OUTPUT "\n";
    }

    $liststate++;
}

# Close the current paragraph, if one is open ($parastate > 0).
sub endpara{
    if($parastate > 0){
        print OUTPUT "\n";
        $parastate--;
    }
}

# Open a paragraph, unless we are inside a display ($displaystate != 0).
sub startpara{
    if($displaystate == 0){
        print OUTPUT "\n";
        $parastate++;
    }
}

# Translate troff font escapes in one input line into inline markup.
#
# Arguments: the current display state (not used at present) and the line.
# Returns the converted line; the words are re-joined with a single space
# and a trailing space is appended after the last word.
#
# The fonts that are currently open are tracked in the global $stylestate,
# one letter per open font (B, I, c, W, b -- see closestate), so that \fR can
# close all of them and \fP only the most recent one.
#
# NOTE(review): the opening tags emitted here are derived from the closing
# tags in closestate(); any attributes the upstream script put on them are
# unknown -- confirm against upstream.
sub processstyles{
    my($displaystate, $line) = @_;
    my($newline, $word, $retStr, $changed);

    $newline = "";

    # It turns out that a whole bunch of these are actually font change
    # commands...
    foreach $word (split(/ /, $line)){
        # Each pass handles at most one escape of each kind (the last one in
        # the word, because the leading .* is greedy), so keep going until a
        # pass changes nothing.
        $changed = 1;
        while($changed){
            $changed = 0;

            # Bold
            if($word =~ /(.*)\\fB(.*)/){
                my($before, $after) = ($1, $2);
                $word = closeallstates() . "$before<keycap>$after";
                $stylestate = "$stylestate" . "B";
                $changed = 1;
            }

            # Italic
            if($word =~ /(.*)\\fI(.*)/){
                my($before, $after) = ($1, $2);
                $word = "$before<emphasis>$after";
                $stylestate = "$stylestate" . "I";
                $changed = 1;
            }

            # Courier
            if($word =~ /(.*)\\fC(.*)/){
                my($before, $after) = ($1, $2);
                $word = closeallstates() . "$before<command>$after";
                $stylestate = "$stylestate" . "c";
                $changed = 1;
            }

            # Constant width
            if($word =~ /(.*)\\f\(CW(.*)/){
                my($before, $after) = ($1, $2);
                $word = "$before<emphasis>$after";
                $stylestate = "$stylestate" . "W";
                $changed = 1;
            }

            # Constant width bold
            if($word =~ /(.*)\\f\(CB(.*)/){
                my($before, $after) = ($1, $2);
                $word = closeallstates() . "$before<command>$after";
                $stylestate = "$stylestate" . "b";
                $changed = 1;
            }

            # Back to roman (closes everything) or to the previous font
            # (closes only the most recently opened one)
            if($word =~ /(.*)\\f([RP])(.*)/){
                my($before, $font, $after) = ($1, $2, $3);
                dblog("{$stylestate}");

                if($font eq "R"){
                    $word = $before . closeallstates() . $after;
                }
                else{
                    # Nothing to close when no font is open
                    $retStr = "";
                    if($stylestate ne ""){
                        $retStr = closestate(substr($stylestate, -1, 1));
                    }
                    $word = "$before$retStr$after";
                    $stylestate =~ s/[A-Za-z]$//;
                }
                $changed = 1;
            }

            # Horizontal spacing (no mapping to docbook)
            if($word =~ s/\\\|//g){
                $changed = 1;
            }
        }

        $newline = "$newline$word ";
    }

    return $newline;
}

# Return the closing tags for every open font, innermost first, and mark all
# fonts as closed by emptying $stylestate.
sub closeallstates{
    my($retStr, $count);

    $retStr = "";
    for($count = length($stylestate) - 1; $count > -1; $count--){
        $retStr = $retStr . closestate(substr($stylestate, $count, 1));
    }

    $stylestate = "";
    return $retStr;
}

# Map one $stylestate letter to the closing tag of the element that was
# opened for it. Unknown letters give an empty string.
sub closestate{
    my($state) = @_;

    if($state eq "I"){
        return "<\/emphasis>";
    }
    elsif($state eq "B"){
        return "<\/keycap>";
    }
    elsif($state eq "b"){
        return "<\/command>";
    }
    elsif($state eq "W"){
        return "<\/emphasis>";
    }
    elsif($state eq "c"){
        return "<\/command>";
    }
    else{
        return "";
    }
}

# Debug hook: takes a message and currently discards it. Uncomment the print
# to have the messages written into the output.
sub dblog{
    my($string) = @_;

    # print OUTPUT $string;
}