#!/usr/bin/perl

# groff2db 0.5. Copyright (c) Michael Still 2001 and 2002 (mikal@stillhq.com)
# Released under the terms of the GNU GPL. Please submit all patches to
# mikal@stillhq.com. Initially developed for Greg Lehey's "The Complete
# FreeBSD"... Many thanks for his sponsorship and support of this project.

# The basic concept of this script is to convert troff marked up input into
# docbook compliant XML. It also makes an attempt at being able to convert
# man pages to docbook...

# Command line arguments:
#   -t (list of if arguments to consider true)   NOT IMPLEMENTED YET
#   -f (list of if arguments to consider false)  NOT IMPLEMENTED YET
#   -s The starting section number in the document
#      (for ease of embedding into others)

use strict;

# File-wide conversion state.  Everything below is shared between the
# preprocessing pass, the main conversion loop and the helper subs.
my($INPUT, $PREPROC, $outputstyle);
my($parastate, $liststate, $listitemstate, $footnotestate, $sectionstate,
   $tablestate, $stylestate, $displaystate, $waitstate, $quotestate,
   $pausedlist, $tablecolcount, $tablebracestate);
my($length);
my($linecount, $condlinecount, $firstsect, $firstsectnum);
my(@inline, @outline, $linemax, $count);
my($temp, $prevtemp, $state, $subst);

# todo: fix this hack
# The second command line word is taken to be the value of -s without looking
# at the first word.  The presence check has to happen before the
# subtraction: an absent argument minus one is -1, never the empty string, so
# testing the result would never select the default.
if(defined($ARGV[1]) && $ARGV[1] ne ""){
    $firstsectnum = $ARGV[1] - 1;
}
else{
    $firstsectnum = 1;
}

$linecount = 0;
$tablestate = 0;

# The preprocessed input is staged in tmp.pp in the current directory.  The
# bareword handle is kept because the rest of the script prints to PREPROC.
open(PREPROC, '>', 'tmp.pp')
    or die "groff2db: cannot open tmp.pp for writing: $!\n";

#####################################################################
# We setup the conversion rules for many of the translations here...
#####################################################################

# Preprocessing rules.  Each rule pairs a pattern, matched at the start of an
# input line, with the action to run when it matches.  Only the first matching
# rule fires for a given line.  The patterns are compiled with qr// so that
# the leading "\." stays a literal dot; inside a double-quoted string the
# backslash is dropped and the dot would match any character.
my @preproc_rules = (
    # Table start
    [ qr/\.TH[ \t]*(.*)/, sub { $tablestate = 1; $temp = ""; } ],

    # Table end
    [ qr/\.TE/,           sub { $tablestate = 0; print PREPROC "$temp\n"; } ],

    # Commands which escape table mode (for the benefit of the man page
    # conversion)
    [ qr/\.SH[ \t]+(.*)/, sub { if($tablestate == 1){ $tablestate = 0; } } ],
);

# First pass: read the troff source and write a normalised copy to PREPROC.
# Input is taken from standard input because @ARGV carries options, not file
# names.
while(<STDIN>){
    for my $rule (@preproc_rules){
        my($pattern, $action) = @$rule;
        if(/^$pattern/){
            $action->();
            last;
        }
    }

    # This is the inclusion of a shared object, we do this here so it can be
    # processed normally later...  The replacement carries its own trailing
    # newline so the next input line is not glued onto the "See ..." text.
    if(/^\.so (.*)\.[0-9].*/i){
        $_ = ".P\nSee $1\n";
    }

    # Blank lines in man pages equate to the .P macro in mm land
    if(/^[ \t\r]*$/){
        $_ = ".P\n";
    }

    # If we are in table state, then we need to rewrite lines for the
    # table cells
    if($tablestate == 1){
        s/T\{\n//;
        s/T\}//g;

        if(/^\./){
            # Requests start on a line of their own
            $_ = "\n$_";
        }
        elsif(/\#/){
            # Lines containing a '#' keep their newline
        }
        else{
            chomp;
        }

        # Accumulate the table text; it is flushed when .TE is seen
        $temp = "$temp $_";
    }
    else{
        print PREPROC "$_";
    }
}

close(PREPROC)
    or die "groff2db: error writing tmp.pp: $!\n";
open(INPUT, '<', 'tmp.pp')
    or die "groff2db: cannot open tmp.pp for reading: $!\n";

# Reset the conversion state for the second pass
$liststate = 0;
$listitemstate = 0;
$parastate = 0;
$footnotestate = 0;
$sectionstate = $firstsectnum;
$tablestate = 0;
$displaystate = 0;
$stylestate = "";
$quotestate = 0;
$pausedlist = 0;
$firstsect = 1;
$state = "";

# For each line in the file...
# Second pass: the main conversion loop, the end-of-document unwinding and the
# four helper subs.
#
# Per iteration the loop
#   - increments $linecount;
#   - reports ".if" and "..if" conditionals on STDERR; for a compound "..if"
#     it keeps consuming lines until one matching "..else" or "..endif" is
#     seen, counting them in $linecount and $condlinecount;
#   - applies a series of substitutions to $_ (font escapes, "\&...", "\-",
#     "\(ae", "\*F", "\f(BI", "\(em");
#   - repeatedly rewrites the first remaining \fI / \fB / \fP escape until the
#     line stops changing, mapping I to "emphasis", B to "command" and P to
#     nothing, with $state carried between rewrites;
#   - dispatches on the troff request at the start of the line through the
#     long elsif chain; a request with no branch of its own is reported on
#     STDERR as "DEFAULT COMMAND FOUND";
#   - otherwise treats the line as document text and prints it, with extra
#     handling while $tablestate is greater than 1.
#
# After the loop the open paragraph is ended, $sectionstate is counted down to
# $firstsectnum, INPUT is closed and the script exits.
#
# Helper subs (defined after the exit):
#   liststart() - ends the current paragraph and increments $liststate; when a
#                 list is already open it first decrements $listitemstate if
#                 that is positive.
#   listend()   - decrements $liststate if it is positive and clears
#                 $listitemstate; leaving the outermost list calls
#                 startpara(), leaving a nested list sets $pausedlist.
#   endpara()   - decrements $parastate if it is positive.
#   startpara() - increments $parastate unless $displaystate is non-zero.
#
# NOTE(review): in this copy the loop condition "while()" and the assignments
# "$_ = ;" name no filehandle, several substitutions replace text with itself
# or have an empty pattern (e.g. "s/&/&/g", "s//>/g"), and most print
# statements emit only "\n" or "".  It looks like angle-bracket text (readline
# operators, XML tags and entities) was lost from the file -- confirm against
# a pristine copy before changing any output string.
# NOTE(review): in the ".H" heading branch the loop
# "while($sectionstate < ($1 - 2))" does not modify $sectionstate in its body
# as shown here, and the "$1 > 5" warning cannot trigger because the pattern
# only captures 0-5 -- verify both.
while(){ $linecount++; # There are some conditional elements used, for instance in xtheory.mm (which is also an # article) if(/^\.if[ \t]+(.*)[ \t]+\.(.*)/){ printf STDERR "If \"$1\" found, consider setting...\n"; $_ = ; } elsif(/^\.\.if (.*)/){ printf STDERR "Compound if \"$1\" found, consider setting...\n"; $waitstate = 1; $condlinecount = 0; while(($waitstate == 1) && ($_ = )){ $linecount++; $condlinecount++; if(/^..else/){ $waitstate = 0; } elsif(/^..endif/){ $waitstate = 0; } } } # The libpng man page is on mighty drugs s/^\\fI\\fB$/.P/; s/\\fP\\fB/\\fB/g; s/\\fP\\fI/\\fI/g; # Man pages also do wierd things with ...'s s/\\&\.\.\./.../g; # Ampersands s/&/&/g; # LT s//>/g; # Mu (micro) doesn't work with db2ps (jade) s/µ/μ/g; # Man pages escape hyphens s/\\-/-/g; # Sometimes there are some other special characters that we deal with s/\\\(ae/æg/; # Footnote thingie we don't use s/\\\*F//g; # Bold italic for a single word (there is no equivalent, so merely italics) s/\\f\(BI([^ \t\"\\]*)/$1<\/emphasis>/ig; # Italic for a single word s/[ \t]*\\\(em[ \t]*/—/ig; # We run some little conversion rules to deal with things like italics # et al $prevtemp = "NOSUCHLINEFOOFOOFOO"; $temp = $_; while($prevtemp ne $temp){ $prevtemp = $temp; $_ = $temp; if(/(.*?)\\f([IBP])(.*)/){ if($2 eq "I"){ $subst = "emphasis"; } elsif($2 eq "B"){ $subst = "command"; } elsif($2 eq "P"){ $subst = ""; } if($subst ne ""){ $temp = "$1$state<$subst>$3"; $state = ""; } else{ $temp = "$1$state$3"; $state = ""; } } } $_ = $temp; ############################################################## # Comments if(/^\.\\\"(.*)/i){ $_ = $1; s/nroff-fill/sgml-fill/g; print "\n"; } # No mapping elsif(/^\.\./){ } # Aside start elsif(/^\.Aside(.*)/i){ endpara(); print "\n"; startpara(); } elsif(/^\.End-aside(.*)/i){ endpara(); print "\n"; startpara(); } # No mapping? elsif(/^\.ad/i){ } # No mapping? 
elsif(/^\.br/i){ } # No mapping elsif(/^\.ce/i){ } # Defining display strings elsif(/^.ds/){ } # Display start -- is case sensitive elsif((/^\.DS(.*)/) || (/^\.Dx(.*)/i)){ endpara(); print "$state\n"; $displaystate = 1; } # Display end elsif(/^\.DE(.*)/i){ if($displaystate == 1){ print "$state\n"; $displaystate = 0; startpara(); } } # The content of floating displays is handled in with the images themselves # This also squelches the display output command elsif(/^\.DF/i){ } # Ifs elsif(/^\.\.if/i){ } # Else elsif(/^\.\.else/i){ } # Endifs elsif(/^\.\.endif/i){ } # Setting the font family elsif(/^\.fam/i){ # No mapping? } # Set the font elsif(/^\.ft/i){ # No mapping } # Make a few words bold (todo, check mapping makes sense) (man pages) elsif(/^\.SM[ \t]+(.*)/){ print "$1\n"; } # Italic (man pages) elsif(/^\.I[ \t]+(.*)/){ print "$1\n"; } # Bold (man pages) elsif(/^\.B[ \t]+(.*)/){ print "$1\n"; } # References to other pages (man pages) elsif(/^\.IR[ \t]+(.*)/){ print "$1\n"; } # Footnote start elsif(/^\.FS[ \t]*(.*)/i){ print "\n"; startpara(); $footnotestate = 1; } # Footnote end elsif(/^\.FE[ \t]*(.*)/i){ if($footnotestate == 0){ } endpara(); print "\n"; $footnotestate = 0; } # Figure headings elsif(/^\.Figure-heading [\"]*([^\"]*)[\"]*/i){ print "
"; if($1 ne ""){ print "$1"; } print "
\n"; } # Filenames elsif(/^\.File ([ \t]*)(.*)/i){ print "$1$2\n"; } # Headings elsif(/^\.H([0-5])[ \t]*(.*)/i){ print "$state"; endpara(); listend(); # Docbook only goes up to sect5 if($1 > 5){ print STDERR "Warning: docbook sect limit exceeded\n"; } while($sectionstate > ($1 - 2)){ if($firstsect == 1){ $firstsect = 0; } else{ print "\n"; } $sectionstate--; } while($sectionstate < ($1 - 2)){ if($firstsect == 0){ print ""; } } $sectionstate++; if($firstsect == 0){ print ""; } $_ = $2; s/\"//g; if($_ ne ""){ print "$_\n"; } startpara(); } # Headings (Unsupported) elsif(/^\.H([6-9])[ \t]*(.*)/i){ print "$state"; endpara(); print "UNSUPPRTED SECTION LEVEL\n"; $sectionstate = 1; } # Headings (man pages) elsif(/^\.SH[ \t]*(.*)/i){ print "$state"; endpara(); listend(); endpara(); while($sectionstate > $firstsectnum){ print "\n"; $sectionstate--; } $sectionstate++; print ""; $_ = $1; s/\"//g; if($_ ne ""){ print "$_\n"; } startpara(); } # Highlights elsif(/^\.Highlight(.*)/i){ endpara(); print "\n"; startpara(); } elsif(/^\.End-highlight(.*)/i){ endpara(); print "\n"; startpara(); } # Indents (not used very often in Greg's source) elsif(/^\.Indent(.*)/i){ endpara(); print "\n"; startpara(); } elsif(/^\.End-indent(.*)/i){ endpara(); print "\n"; startpara(); } # Another form of indenting (has no mapping) elsif(/^\.in/i){ } # List start elsif((/^\.LS(.*)/i) || (/^\.LB(.*)/i)){ liststart(); } # List item elsif(/^\.LI[ \t]*(.*)/i){ if($liststate == 0){ liststart(); } if($pausedlist == 1){ print "\n"; $pausedlist = 0; } endpara(); if($listitemstate == 1){ print "\n"; } print "\n"; startpara(); $listitemstate = 1; } # List item from a man page elsif(/^\.TP/i){ if($liststate == 0){ liststart(); } if($pausedlist == 1){ print "\n"; $pausedlist = 0; } endpara(); if($listitemstate == 1){ print "\n"; } print "\n"; startpara(); $listitemstate = 1; } # List end elsif(/^\.LE(.*)/i){ endpara(); listend(); } # No mapping elsif(/^\.na/i){ } # Space on page elsif(/^\.ne/i){ # No mapping to 
docbook? } # No fill elsif(/^\.nf/i){ # No mapping to docbook? } # Set the fill (after a ne?) elsif(/^\.fi/i){ # No mapping to docbook? } # Page link target -- some of Greg's anchors start with numbers, which is why we insert some text elsif(/^\.Pn[ \t]+(.*)/i){ print "\n"; } # Paragraphs elsif(/^\.P[ \t]*([0-9]*)(.*)/i){ print "$state\n"; $state = ""; endpara(); startpara(); } # Pictures elsif(/^\.PIC [\"]*([^\"]*)[\"]*/i){ print "
\n"; } # Quotes elsif(/^\.Quote(.*)/i){ print "\n"; $quotestate = 1; } elsif((/^\.End-quote(.*)/i) && ($quotestate == 1)){ print "\n"; } # Insets are treated as quotes as well elsif(/^\in[ \t]\+(.*)/i){ print "\n"; $quotestate = 1; } elsif((/^\.in/i) && ($quotestate == 1)){ print "\n"; } # Spacing commands don't map to db elsif(/^\.sp(.*)/i){ } # Table title elsif(/^\.TB[ \t]*[\"]*([^\"]*)[\"]*/i){ print ""; if($1 ne "" ){ print "$1\n"; } $tablestate = 1; } # Table start elsif(/^\.TS[ \t]*(.*)/i){ } # Table heading elsif(/^\.TH(.*)/i){ $tablestate = 2; } # Table end elsif(/^\.TE/i){ print "
\n"; $tablestate = 0; } # Table reference for later elsif(/^\.Tn/i){ } # Chapters elsif(/^\.Chapter[ \t]\\\*\[([^ \t]*)\][ \t]\"(.*)\"/){ print ""; if($2 ne ""){ print "$2\n"; } # The introductory text doesn't need to have a sect1 startpara(); $sectionstate = 1; } elsif(/^\.X[ \t]\"*([^ \",]*)[ \t,]*([^,\"]*)\"*/i){ print "$1"; if($2 ne ""){ print "$2"; } print "\n"; } # We don't know anything about this command at all elsif(/^\.(.*)/i){ print STDERR "DEFAULT COMMAND FOUND: $1\n"; } # A document line? else{ # References to chapters and stuff like that s/\\\*\[([^\]]*)\]//g; if($tablestate > 1){ s/\#/<\/entry>/g; } if($tablestate == 2){ $tablecolcount = 1; $temp = $_; while($temp =~ //g){ $tablecolcount++ } print "\n"; $tablestate = 3; } if($tablestate > 1){ print ""; } s/[ \t\n]+$//; # chomp; print "$_\n"; if($tablestate > 1){ print "\n"; } } } endpara(); while($sectionstate > $firstsectnum){ print "
\n"; $sectionstate--; } close INPUT; exit; sub liststart(){ endpara(); if($liststate == 0){ print "\n"; } else{ if($listitemstate > 0){ print "\n"; $listitemstate--; } print "\n"; print "\n"; } $liststate++; } sub listend(){ if($listitemstate > 0){ print "\n"; } if($liststate > 0){ if($liststate == 1){ print "\n"; startpara(); } else{ print "\n"; $pausedlist = 1; } $liststate--; } $listitemstate = 0; } sub endpara(){ if($parastate > 0){ print "\n"; $parastate--; } } sub startpara(){ if($displaystate == 0){ print "\n"; $parastate++; } }